From 7174cc2930b3463c01d0dbd6a5a769b6803e2ef7 Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 14:30:54 -0400
Subject: [PATCH 1/8] bench: add dgraph-shaped iterator micro-benchmarks

Add four benchmarks modeling dgraph's posting-layer iterator hot
paths. dgraph drives badger in a specific combination of options
that none of the existing root-package benchmarks exercise:

  - NamespaceOffset = 1 (DB.isBanned runs on every key)
  - NumVersionsToKeep = math.MaxInt32 (all versions retained)
  - DetectConflicts = false (dgraph owns OCC)
  - PrefetchValues = false (key-only scans in every hot path)
  - AllVersions = true (MVCC-aware reads)
  - Prefix-bounded forward iteration over mid-sized structured keys

Keys mirror dgraph's data-key layout (1B type + 8B namespace + 2B
attr-len + ~20B attr + 1B subtype + 8B UID = 40 bytes).

Benchmarks:

  - BenchmarkDgraphPrefixScanKeyOnly: models has() / index-scan
    paths (posting.MemoryLayer.IterateDisk, worker.sort.go)
  - BenchmarkDgraphPrefixScanAllVersions: models the rollup
    iterator with multiple MVCC versions per key
  - BenchmarkDgraphKeyIteratorAllVersions: models readFromDisk
    via NewKeyIterator over a single posting key (every cache miss)
  - BenchmarkDgraphPrefixScanNamespaceOffset: isolates the cost of
    NamespaceOffset=1 vs -1 (i.e. the per-key DB.isBanned tax)

These benchmarks are intended as the measurement baseline for
upcoming iterator hot-path optimizations.
---
 iterator_dgraph_bench_test.go | 305 ++++++++++++++++++++++++++++++++++
 1 file changed, 305 insertions(+)
 create mode 100644 iterator_dgraph_bench_test.go

diff --git a/iterator_dgraph_bench_test.go b/iterator_dgraph_bench_test.go
new file mode 100644
index 000000000..c890709a3
--- /dev/null
+++ b/iterator_dgraph_bench_test.go
@@ -0,0 +1,305 @@
+/*
+ * SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package badger
+
+import (
+	"encoding/binary"
+	"math"
+	"math/rand"
+	"os"
+	"testing"
+
+	"github.com/dgraph-io/badger/v4/y"
+)
+
+// dgraph-shaped iterator micro-benchmarks.
+//
+// These benchmarks model the hot iterator paths in dgraph's posting layer
+// (posting/mvcc.go: ReadPostingList, IterateDisk, sort.go: index bucket scan).
+// They are intentionally distinct from the existing root-package benchmarks
+// because none of those exercise the combination dgraph actually uses:
+//
+//   - NamespaceOffset = 1 (so DB.isBanned runs on every key)
+//   - NumVersionsToKeep = math.MaxInt32 (dgraph keeps all versions)
+//   - DetectConflicts = false (dgraph owns OCC)
+//   - PrefetchValues = false (key-only scans in hot paths)
+//   - AllVersions = true (MVCC-aware reads in ReadPostingList / rollup)
+//   - Prefix-bounded scans over mid-sized structured keys
+//
+// Key layout mirrors dgraph's data keys (see x/keys.go in dgraph):
+//
+//   byte 0      : type prefix (0x00 = data)
+//   bytes 1..8  : namespace, big-endian uint64
+//   bytes 9..10 : 2-byte attr length, big-endian uint16
+//   bytes 11..  : attr name (variable, ~16 bytes typical)
+//   next 1 byte : subtype
+//   last 8 bytes: UID, big-endian
+//
+// Total: 1 + 8 + 2 + len(attr) + 1 + 8 ≈ 36 bytes for a 16-byte attr.
+
+const (
+	dgKeyTypeData = 0x00
+	dgAttrName    = "predicate_namespaced" // 20 bytes -> 40-byte keys
+)
+
+// dgKey constructs a dgraph-shaped data key with the given namespace, attr,
+// subtype, and UID.
+func dgKey(ns uint64, attr string, subtype byte, uid uint64) []byte {
+	buf := make([]byte, 1+8+2+len(attr)+1+8)
+	buf[0] = dgKeyTypeData
+	binary.BigEndian.PutUint64(buf[1:9], ns)
+	binary.BigEndian.PutUint16(buf[9:11], uint16(len(attr)))
+	copy(buf[11:11+len(attr)], attr)
+	buf[11+len(attr)] = subtype
+	binary.BigEndian.PutUint64(buf[12+len(attr):], uid)
+	return buf
+}
+
+// dgPrefix returns the (ns + attr) prefix shared by all keys for one predicate.
+// This is what dgraph iterates with for prefix-bounded scans.
+func dgPrefix(ns uint64, attr string) []byte {
+	buf := make([]byte, 1+8+2+len(attr))
+	buf[0] = dgKeyTypeData
+	binary.BigEndian.PutUint64(buf[1:9], ns)
+	binary.BigEndian.PutUint16(buf[9:11], uint16(len(attr)))
+	copy(buf[11:], attr)
+	return buf
+}
+
+// dgraphTestOptions returns the badger options that match dgraph's
+// production configuration: managed DB, all-versions retention, no conflict
+// detection, namespace offset = 1.
+func dgraphTestOptions(dir string) Options {
+	return DefaultOptions(dir).
+		WithSyncWrites(false).
+		WithLoggingLevel(WARNING).
+		WithNumVersionsToKeep(math.MaxInt32).
+		WithDetectConflicts(false).
+		WithNamespaceOffset(1)
+}
+
+// dgraphLoadDB populates a managed DB with `nKeys` unique keys under a single
+// predicate, each carrying `versionsPerKey` MVCC versions. UIDs are dense
+// (1..nKeys) so prefix scans land contiguous keys, mirroring dgraph's
+// `has()` predicate scans.
+func dgraphLoadDB(b *testing.B, db *DB, ns uint64, attr string, nKeys, versionsPerKey int) {
+	b.Helper()
+	// Small value: realistic for posting-list deltas (a few hundred bytes is
+	// the upper end; we use 64 bytes to keep total disk footprint bounded).
+	val := make([]byte, 64)
+	for i := range val {
+		val[i] = byte(i)
+	}
+
+	const batch = 4000
+	commitTs := uint64(1)
+	for start := 0; start < nKeys; start += batch {
+		end := start + batch
+		if end > nKeys {
+			end = nKeys
+		}
+		// Write `versionsPerKey` versions of each key in this batch.
+		for v := 0; v < versionsPerKey; v++ {
+			txn := db.NewTransactionAt(math.MaxUint64, true)
+			for i := start; i < end; i++ {
+				key := dgKey(ns, attr, 0, uint64(i+1))
+				if err := txn.SetEntry(&Entry{Key: key, Value: val}); err != nil {
+					b.Fatalf("SetEntry: %v", err)
+				}
+			}
+			if err := txn.CommitAt(commitTs, nil); err != nil {
+				b.Fatalf("CommitAt: %v", err)
+			}
+			commitTs++
+		}
+	}
+	// Force everything to disk so iteration reflects the on-disk format
+	// (avoids measuring only the memtable hit).
+	if err := db.Flatten(2); err != nil {
+		b.Fatalf("Flatten: %v", err)
+	}
+}
+
+// openDgraphDB opens a managed DB with the dgraph-shaped options.
+func openDgraphDB(b *testing.B) (*DB, string) {
+	b.Helper()
+	dir, err := os.MkdirTemp(".", "badger-dgraph-bench")
+	y.Check(err)
+	db, err := OpenManaged(dgraphTestOptions(dir))
+	y.Check(err)
+	return db, dir
+}
+
+// BenchmarkDgraphPrefixScanKeyOnly models the IterateDisk / index-scan path:
+// prefix-bounded forward iteration with PrefetchValues=false, one version per
+// key. This is what dgraph's `has()` predicate evaluator and sort.go index
+// scans drive.
+func BenchmarkDgraphPrefixScanKeyOnly(b *testing.B) {
+	const (
+		ns    = uint64(0x0102030405060708)
+		nKeys = 200_000
+	)
+	db, dir := openDgraphDB(b)
+	defer func() { db.Close(); removeDir(dir) }()
+	dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1)
+
+	prefix := dgPrefix(ns, dgAttrName)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		txn := db.NewTransactionAt(math.MaxUint64, false)
+		opt := DefaultIteratorOptions
+		opt.Prefix = prefix
+		opt.PrefetchValues = false
+		it := txn.NewIterator(opt)
+		count := 0
+		for it.Rewind(); it.Valid(); it.Next() {
+			count++
+		}
+		it.Close()
+		txn.Discard()
+		if count != nKeys {
+			b.Fatalf("expected %d keys, got %d", nKeys, count)
+		}
+	}
+	b.ReportMetric(float64(nKeys), "keys/op")
+}
+
+// BenchmarkDgraphPrefixScanAllVersions models the rollup path:
+// prefix-bounded forward iteration with PrefetchValues=false and
+// AllVersions=true, where each key has several MVCC versions.
+func BenchmarkDgraphPrefixScanAllVersions(b *testing.B) {
+	const (
+		ns             = uint64(0x0102030405060708)
+		nKeys          = 50_000
+		versionsPerKey = 5
+	)
+	db, dir := openDgraphDB(b)
+	defer func() { db.Close(); removeDir(dir) }()
+	dgraphLoadDB(b, db, ns, dgAttrName, nKeys, versionsPerKey)
+
+	prefix := dgPrefix(ns, dgAttrName)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		txn := db.NewTransactionAt(math.MaxUint64, false)
+		opt := DefaultIteratorOptions
+		opt.Prefix = prefix
+		opt.PrefetchValues = false
+		opt.AllVersions = true
+		it := txn.NewIterator(opt)
+		count := 0
+		for it.Rewind(); it.Valid(); it.Next() {
+			count++
+		}
+		it.Close()
+		txn.Discard()
+		// Should see nKeys * versionsPerKey items.
+		if count != nKeys*versionsPerKey {
+			b.Fatalf("expected %d items, got %d", nKeys*versionsPerKey, count)
+		}
+	}
+	b.ReportMetric(float64(nKeys*versionsPerKey), "items/op")
+}
+
+// BenchmarkDgraphKeyIteratorAllVersions models readFromDisk:
+// a NewKeyIterator over a single posting key with AllVersions=true. dgraph
+// invokes this on every cache miss in the posting layer.
+func BenchmarkDgraphKeyIteratorAllVersions(b *testing.B) {
+	const (
+		ns             = uint64(0x0102030405060708)
+		nKeys          = 20_000
+		versionsPerKey = 8
+	)
+	db, dir := openDgraphDB(b)
+	defer func() { db.Close(); removeDir(dir) }()
+	dgraphLoadDB(b, db, ns, dgAttrName, nKeys, versionsPerKey)
+
+	// Pre-build the lookup key set so the random selection cost stays out
+	// of the measured loop.
+	keys := make([][]byte, nKeys)
+	for i := 0; i < nKeys; i++ {
+		keys[i] = dgKey(ns, dgAttrName, 0, uint64(i+1))
+	}
+	rng := rand.New(rand.NewSource(1))
+
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		key := keys[rng.Intn(nKeys)]
+		txn := db.NewTransactionAt(math.MaxUint64, false)
+		opt := DefaultIteratorOptions
+		opt.PrefetchValues = false
+		opt.AllVersions = true
+		it := txn.NewKeyIterator(key, opt)
+		count := 0
+		for it.Seek(key); it.Valid(); it.Next() {
+			count++
+		}
+		it.Close()
+		txn.Discard()
+		if count != versionsPerKey {
+			b.Fatalf("expected %d versions, got %d", versionsPerKey, count)
+		}
+	}
+}
+
+// BenchmarkDgraphPrefixScanNamespaceOffset measures the cost of
+// NamespaceOffset=1 vs NamespaceOffset=-1 (off) under identical workload.
+// This isolates the per-key cost of DB.isBanned, which today takes an
+// RWMutex on every iterator step even when no namespaces are banned.
+func BenchmarkDgraphPrefixScanNamespaceOffset(b *testing.B) {
+	const (
+		ns    = uint64(0x0102030405060708)
+		nKeys = 100_000
+	)
+	cases := []struct {
+		name   string
+		nsOff  int // -1 = off, 1 = dgraph-style
+	}{
+		{"off", -1},
+		{"dgraph", 1},
+	}
+
+	for _, c := range cases {
+		b.Run(c.name, func(b *testing.B) {
+			dir, err := os.MkdirTemp(".", "badger-dgraph-nsoff")
+			y.Check(err)
+			defer removeDir(dir)
+			opts := DefaultOptions(dir).
+				WithSyncWrites(false).
+				WithLoggingLevel(WARNING).
+				WithNumVersionsToKeep(math.MaxInt32).
+				WithDetectConflicts(false).
+				WithNamespaceOffset(c.nsOff)
+			db, err := OpenManaged(opts)
+			y.Check(err)
+			defer db.Close()
+			dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1)
+
+			prefix := dgPrefix(ns, dgAttrName)
+			b.ResetTimer()
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				txn := db.NewTransactionAt(math.MaxUint64, false)
+				opt := DefaultIteratorOptions
+				opt.Prefix = prefix
+				opt.PrefetchValues = false
+				it := txn.NewIterator(opt)
+				count := 0
+				for it.Rewind(); it.Valid(); it.Next() {
+					count++
+				}
+				it.Close()
+				txn.Discard()
+				if count != nKeys {
+					b.Fatalf("expected %d keys, got %d", nKeys, count)
+				}
+			}
+		})
+	}
+}
+

From 4942ce1c7362c5a0ee2b1a8a5ca17fcf32559bd2 Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 14:55:14 -0400
Subject: [PATCH 2/8] perf: lockless fast path for DB.isBanned when no
 namespaces banned

DB.isBanned runs on every iterator step and every key access when
NamespaceOffset >= 0. The hot path acquires lockedKeys.RLock and probes
the keys map even when the ban set is empty (the steady-state case in
production deployments like dgraph that enable NamespaceOffset for
multi-tenant key isolation but rarely ban anything).

Add a monotonic atomic.Bool flag (lockedKeys.hasAny) that flips from
false to true on the first add() and never flips back. isBanned can
check this flag with a single atomic load and skip the lock + map
lookup when no ban has ever been recorded.

Correctness: add() stores the flag inside the same lock that guards
the keys map, so any reader observing hasAny=true is guaranteed to
see the populated map via release-acquire ordering on the lock.
Readers observing hasAny=false correctly return early because the
flag is monotonic and is set before any key is observable in the map.
DB-reopen replay path also calls add() so reopened DBs retain the
flag for previously-recorded bans.

Measured on dgraph-shaped iterator benchmarks (Apple M4 Max,
median-of-3, -benchtime=5s):

  PrefixScanKeyOnly        11.80M -> 10.56M ns/op  (-10.5%)
  PrefixScanAllVersions    11.98M -> 10.60M ns/op  (-11.4%)
  KeyIteratorAllVersions    1,514 ->  1,371 ns/op   (-9.4%)
  NamespaceOffset/dgraph    5.51M ->  4.96M ns/op  (-10.0%)
  NamespaceOffset/off       5.08M ->  5.01M ns/op   (-1.4%, noise)

The NamespaceOffset/dgraph case now matches NamespaceOffset/off
(both ~4.96-5.01M ns/op) -- the entire ~8.5% NamespaceOffset tax
from the per-key RLock is eliminated.

Tests: full suite passes (TestBannedPrefixes, TestIterateWithBanned
verified) except TestProtosRegenerate which fails identically on
baseline due to missing protoc binary.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 db.go | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/db.go b/db.go
index 8a1021308..129607a32 100644
--- a/db.go
+++ b/db.go
@@ -52,12 +52,19 @@ type closers struct {
 type lockedKeys struct {
 	sync.RWMutex
 	keys map[uint64]struct{}
+	// hasAny is a fast-path flag: false until the first add(), then true forever.
+	// Hot-path callers (DB.isBanned) check this without taking the lock so the
+	// common case (empty ban set) costs one atomic load instead of an
+	// RLock/RUnlock pair plus a map lookup. There is no remove API, so the
+	// flag is monotonic and never needs to flip back to false.
+	hasAny atomic.Bool
 }
 
 func (lk *lockedKeys) add(key uint64) {
 	lk.Lock()
 	defer lk.Unlock()
 	lk.keys[key] = struct{}{}
+	lk.hasAny.Store(true)
 }
 
 func (lk *lockedKeys) has(key uint64) bool {
@@ -1846,6 +1853,14 @@ func (db *DB) isBanned(key []byte) error {
 	if db.opt.NamespaceOffset < 0 {
 		return nil
 	}
+	// Fast path: no namespaces have ever been banned in this DB lifetime
+	// (the common production case). Skip the slice + lookup + lock entirely.
+	// isBanned is called on every iterator step and every Txn.Get/modify, so
+	// avoiding the RLock here matters when NamespaceOffset is enabled but no
+	// bans are active.
+	if !db.bannedNamespaces.hasAny.Load() {
+		return nil
+	}
 	if len(key) <= db.opt.NamespaceOffset+8 {
 		return nil
 	}

From 41018baa97332bebf6bb62736a627a58d371d3af Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 15:14:49 -0400
Subject: [PATCH 3/8] perf: hoist mi.Value() and key out of fill() to skip
 per-item double-decode

Iterator.parseItem previously called mi.Value() twice per kept item on
the non-AllVersions hot path:
  1. To pre-check Meta/ExpiresAt for the deleted/expired filter.
  2. Inside fill(), which re-fetched the same ValueStruct (cost includes
     an Arena getVal + ValueStruct.Decode that the CPU profile showed
     consuming ~40% of cumulative samples on dgraph-shaped key-only
     prefix scans).

Refactor fill to take the already-fetched key []byte and a pointer to
the already-decoded *y.ValueStruct, and hoist mi.Value() into both the
AllVersions and non-AllVersions branches of parseItem. fill no longer
touches it.iitr.

Pass vs by pointer (not value): y.ValueStruct is ~40 bytes; passing by
value added a per-item struct copy that wiped out the AllVersions win
(branch only had one Value() call, no double-decode to eliminate).
Pointer collapses to a single 8-byte arg; fill does not store the
pointer so escape analysis keeps vs on the stack.

Correctness:
- vs and key are fetched while the iterator is positioned on the
  current entry; no mi.Next() runs between fetch and fill. Arena/block
  memory backing vs.Value and key is stable for that window.
- In the reverse-iteration goto FILL path, mi has advanced once, so
  the refactor refreshes key = mi.Key() before re-entering FILL, and
  FILL re-fetches vs := mi.Value(). Semantically equivalent to the
  pre-refactor code.

Measured on dgraph-shaped iterator benchmarks (Apple M4 Max,
median-of-3, -benchtime=5s) -- composite (sum of 5 medians) drops
31.13M -> 26.37M ns (-15.3% vs iter1, -23.3% vs upstream baseline):

  PrefixScanKeyOnly        10.56M -> 8.18M ns/op   (-22.5%)
  PrefixScanAllVersions    10.60M -> 10.53M ns/op  (-0.7%, flat)
  KeyIteratorAllVersions    1,371 ->  1,376 ns/op  (+0.4%, noise)
  NamespaceOffset/off       5.01M -> 3.83M ns/op   (-23.6%)
  NamespaceOffset/dgraph    4.96M -> 3.83M ns/op   (-22.7%)

Tests: full suite passes (TestProtosRegenerate continues to fail
identically due to missing protoc binary).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 iterator.go | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/iterator.go b/iterator.go
index f57cfa4c9..ac4c72648 100644
--- a/iterator.go
+++ b/iterator.go
@@ -640,8 +640,9 @@ func (it *Iterator) parseItem() bool {
 	if it.opt.AllVersions {
 		// Return deleted or expired values also, otherwise user can't figure out
 		// whether the key was deleted.
+		vs := mi.Value()
 		item := it.newItem()
-		it.fill(item)
+		it.fill(item, key, &vs)
 		setItem(item)
 		mi.Next()
 		return true
@@ -659,11 +660,13 @@ func (it *Iterator) parseItem() bool {
 		// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
 		// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
 		// which is wrong. Therefore, update lastKey here.
-		it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
+		it.lastKey = y.SafeCopy(it.lastKey, key)
 	}
 
 FILL:
-	// If deleted, advance and return.
+	// If deleted, advance and return. We fetch vs (and reuse key from the
+	// enclosing scope, or refetch after Next on the reverse goto below) so
+	// that fill does not have to call mi.Value() / mi.Key() a second time.
 	vs := mi.Value()
 	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
 		mi.Next()
@@ -671,7 +674,7 @@ FILL:
 	}
 
 	item := it.newItem()
-	it.fill(item)
+	it.fill(item, key, &vs)
 	// fill item based on current cursor position. All Next calls have returned, so reaching here
 	// means no Next was called.
 
@@ -681,9 +684,11 @@ FILL:
 		return true
 	}
 
-	// Reverse direction.
-	nextTs := y.ParseTs(mi.Key())
-	mik := y.ParseKey(mi.Key())
+	// Reverse direction. Refresh key after the Next() above; the iterator
+	// has advanced, so the previous `key` slice now refers to a later block.
+	key = mi.Key()
+	nextTs := y.ParseTs(key)
+	mik := y.ParseKey(key)
 	if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
 		// This is a valid potential candidate.
 		goto FILL
@@ -693,14 +698,18 @@ FILL:
 	return true
 }
 
-func (it *Iterator) fill(item *Item) {
-	vs := it.iitr.Value()
+// fill populates item from the current iterator position. Callers pass the
+// already-fetched key and value pointer to avoid the per-item cost of
+// calling mi.Key() / mi.Value() (and decoding ValueStruct) a second time
+// on the hot iterator path. vs is passed by pointer to avoid copying the
+// ~40-byte ValueStruct on every kept item.
+func (it *Iterator) fill(item *Item, key []byte, vs *y.ValueStruct) {
 	item.meta = vs.Meta
 	item.userMeta = vs.UserMeta
 	item.expiresAt = vs.ExpiresAt
 
-	item.version = y.ParseTs(it.iitr.Key())
-	item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key()))
+	item.version = y.ParseTs(key)
+	item.key = y.SafeCopy(item.key, y.ParseKey(key))
 
 	item.vptr = y.SafeCopy(item.vptr, vs.Value)
 	item.val = nil

From 9267dbb608358b317914691ae95220a48397ee84 Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 15:22:14 -0400
Subject: [PATCH 4/8] perf: skip per-step internal-key probe when prefix
 excludes badgerPrefix

Iterator.parseItem unconditionally ran bytes.HasPrefix(key, badgerPrefix)
on every step to detect badger-internal keys (e.g. !badger!banned). For
prefix-bounded scans whose Prefix's first byte differs from
badgerPrefix[0] ('!'), no key the iterator can land on can possibly be
internal -- the probe is dead. The CPU profile after iter2 attributed
~13% of cumulative samples to runtime.memequal, a portion of which was
this probe.

Precompute canSeeInternalKeys once at iterator construction:
  - empty prefix: true (unbounded scan can hit internal keys)
  - prefix[0] == '!' : true (could be the banned-ns recovery iterator
    or any scan that starts in the internal-key range)
  - otherwise: false (the iterator after Seek(prefix) is positioned
    past the internal-key range and hasPrefix() will reject any
    !-prefixed key anyway, so the per-step probe is provably dead).

When the flag is false, skip both the HasPrefix probe and the
!InternalAccess skip-block. isInternalKey defaults to false, which is
also the correct value for the downstream "!isInternalKey && isBanned"
gate (non-internal keys are subject to ban checks; internal ones are
not, but here we know they can't appear).

Correctness:
- initBannedNamespaces sets Prefix = bannedNsKey (starts with '!') so
  the flag is true and the probe still runs on the recovery path.
- All other badger-internal access uses InternalAccess=true with a
  badgerPrefix-anchored Prefix, so the flag is also true there.
- For dgraph's data scans (Prefix[0] = 0x00) and all other user scans
  the probe is provably dead and can be skipped.

Measured on dgraph-shaped iterator benchmarks (Apple M4 Max,
median-of-3, -benchtime=5s):

  PrefixScanKeyOnly        8.18M -> 7.92M ns/op  (-3.2%)
  PrefixScanAllVersions   10.53M ->10.50M ns/op  (-0.2%, noise)
  KeyIteratorAllVersions  1,376  -> 1,350 ns/op  (-1.9%)
  NamespaceOffset/off      3.83M -> 3.72M ns/op  (-2.6%)
  NamespaceOffset/dgraph   3.83M -> 3.74M ns/op  (-2.4%)

Cumulative since upstream baseline: composite 34.37M -> 25.89M ns
(-24.7%); PrefixScanKeyOnly 11.80M -> 7.92M (-32.9%);
NamespaceOffset/dgraph 5.51M -> 3.74M (-32.1%).

Tests: TestBannedPrefixes, TestIterateWithBanned, TestIterator*, and
TestKeyIterator* all pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 iterator.go | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/iterator.go b/iterator.go
index ac4c72648..d63ac2b09 100644
--- a/iterator.go
+++ b/iterator.go
@@ -435,6 +435,12 @@ type Iterator struct {
 
 	lastKey []byte // Used to skip over multiple versions of the same key.
 
+	// canSeeInternalKeys is true when this iterator can possibly surface a
+	// badger-internal key (e.g. "!badger!banned"). When false, parseItem
+	// can skip the per-step bytes.HasPrefix(key, badgerPrefix) check.
+	// Computed once at construction from opt.Prefix.
+	canSeeInternalKeys bool
+
 	closed  bool
 	scanned int // Used to estimate the size of data scanned by iterator.
 
@@ -482,14 +488,26 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
 	}
 	iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references.
 	res := &Iterator{
-		txn:    txn,
-		iitr:   table.NewMergeIterator(iters, opt.Reverse),
-		opt:    opt,
-		readTs: txn.readTs,
+		txn:               txn,
+		iitr:              table.NewMergeIterator(iters, opt.Reverse),
+		opt:               opt,
+		readTs:            txn.readTs,
+		canSeeInternalKeys: canSeeInternalKeys(opt.Prefix),
 	}
 	return res
 }
 
+// canSeeInternalKeys reports whether an iterator with the given prefix can
+// possibly surface a badger-internal key. Internal keys all live under the
+// fixed badgerPrefix; if the user's prefix exists and starts with a
+// different byte than badgerPrefix[0], no internal key can match.
+func canSeeInternalKeys(prefix []byte) bool {
+	if len(prefix) == 0 {
+		return true
+	}
+	return prefix[0] == badgerPrefix[0]
+}
+
 // NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a
 // single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to
 // additionally run bloom filter lookups before picking tables from the LSM tree.
@@ -616,11 +634,18 @@ func (it *Iterator) parseItem() bool {
 		}
 	}
 
-	isInternalKey := bytes.HasPrefix(key, badgerPrefix)
-	// Skip badger keys.
-	if !it.opt.InternalAccess && isInternalKey {
-		mi.Next()
-		return false
+	// Detect badger-internal keys. When canSeeInternalKeys is false (the
+	// common case for prefix-bounded user scans whose prefix cannot collide
+	// with badgerPrefix), we know the current key cannot be internal and
+	// elide the per-step bytes.HasPrefix(key, badgerPrefix) probe.
+	var isInternalKey bool
+	if it.canSeeInternalKeys {
+		isInternalKey = bytes.HasPrefix(key, badgerPrefix)
+		// Skip badger keys.
+		if !it.opt.InternalAccess && isInternalKey {
+			mi.Next()
+			return false
+		}
 	}
 
 	// Skip any versions which are beyond the readTs.

From 1cb43e70123fdda96e06aa89b4e07ba12add4212 Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 15:35:40 -0400
Subject: [PATCH 5/8] perf(iterator): IteratorOptions.KeyOnly skip per-item
 value SafeCopy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add opt-in KeyOnly flag to IteratorOptions. When set, fill() skips the
per-item SafeCopy of vs.Value into item.vptr, which is the largest
remaining memmove on the key-only forward-scan hot path
(dgraph has()/index scans).

Trade-offs (Option A, strict):
- Item.Value / Item.ValueCopy return new ErrKeyOnlyMode.
- Item.ValueSize / Item.EstimatedSize report 0 / key-size only.
- Item.Key, Version, UserMeta, ExpiresAt, IsDeletedOrExpired unaffected.
- NewIterator forces PrefetchValues=false when KeyOnly=true so the
  prefetch goroutine never starts.

Existing callers see no behavior change. The new benchmark
BenchmarkDgraphPrefixScanKeyOnlyOpt exercises the opt-in path
alongside the original PrefixScanKeyOnly to preserve comparison.

Results (5-bench composite, median-of-3, Apple M4 Max):
- PrefixScanKeyOnlyOpt:  7.76M ns/op (−2.0% vs iter3 KeyOnly, −34.3% vs baseline)
- Composite with opt-in: 25.63M ns (−25.4% vs upstream baseline)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 errors.go                     |  8 ++++++
 iterator.go                   | 54 ++++++++++++++++++++++++++++++++++-
 iterator_dgraph_bench_test.go | 35 +++++++++++++++++++++++
 3 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/errors.go b/errors.go
index dcf0d12ae..40433dcb6 100644
--- a/errors.go
+++ b/errors.go
@@ -46,6 +46,14 @@ var (
 	// ErrBannedKey is returned if the read/write key belongs to any banned namespace.
 	ErrBannedKey = stderrors.New("Key is using the banned prefix")
 
+	// ErrKeyOnlyMode is returned by Item.Value and Item.ValueCopy when the
+	// containing iterator was created with IteratorOptions.KeyOnly=true.
+	// In that mode the iterator never copies the value bytes into the Item
+	// (the main reason to use KeyOnly is to avoid that per-item copy on
+	// key-only scans), so value access is unavailable on those items.
+	ErrKeyOnlyMode = stderrors.New(
+		"Item value is unavailable in KeyOnly iterator mode")
+
 	// ErrThresholdZero is returned if threshold is set to zero, and value log GC is called.
 	// In such a case, GC can't be run.
 	ErrThresholdZero = stderrors.New(
diff --git a/iterator.go b/iterator.go
index d63ac2b09..10b08ea92 100644
--- a/iterator.go
+++ b/iterator.go
@@ -43,6 +43,11 @@ type Item struct {
 	status   prefetchStatus
 	meta     byte // We need to store meta to know about bitValuePointer.
 	userMeta byte
+	// keyOnly is true when the parent iterator was created with
+	// IteratorOptions.KeyOnly. The iterator skips copying value bytes into
+	// this item, so Item.Value/ValueCopy and the size estimators must
+	// short-circuit instead of touching the (nil) vptr.
+	keyOnly bool
 }
 
 // String returns a string representation of Item
@@ -81,6 +86,9 @@ func (item *Item) Version() uint64 {
 // instead, or copy it yourself. Value might change once discard or commit is called.
 // Use ValueCopy if you want to do a Set after Get.
 func (item *Item) Value(fn func(val []byte) error) error {
+	if item.keyOnly {
+		return ErrKeyOnlyMode
+	}
 	item.wg.Wait()
 	if item.status == prefetched {
 		if item.err == nil && fn != nil {
@@ -108,6 +116,9 @@ func (item *Item) Value(fn func(val []byte) error) error {
 // This function is useful in long running iterate/update transactions to avoid a write deadlock.
 // See Github issue: https://github.com/dgraph-io/badger/issues/315
 func (item *Item) ValueCopy(dst []byte) ([]byte, error) {
+	if item.keyOnly {
+		return nil, ErrKeyOnlyMode
+	}
 	item.wg.Wait()
 	if item.status == prefetched {
 		return y.SafeCopy(dst, item.val), item.err
@@ -213,7 +224,14 @@ func (item *Item) prefetchValue() {
 // This can be called while iterating through a store to quickly estimate the
 // size of a range of key-value pairs (without fetching the corresponding
 // values).
+//
+// When the iterator was created with IteratorOptions.KeyOnly=true, the
+// value bytes (and value pointer for vlog entries) are not retained on
+// the item, so this returns the key size only.
 func (item *Item) EstimatedSize() int64 {
+	if item.keyOnly {
+		return int64(len(item.key))
+	}
 	if !item.hasValue() {
 		return 0
 	}
@@ -235,7 +253,13 @@ func (item *Item) KeySize() int64 {
 //
 // This can be called to quickly estimate the size of a value without fetching
 // it.
+//
+// When the iterator was created with IteratorOptions.KeyOnly=true the value
+// length is not retained on the item; this returns 0.
 func (item *Item) ValueSize() int64 {
+	if item.keyOnly {
+		return 0
+	}
 	if !item.hasValue() {
 		return 0
 	}
@@ -312,6 +336,17 @@ type IteratorOptions struct {
 	AllVersions    bool // Fetch all valid versions of the same key.
 	InternalAccess bool // Used to allow internal access to badger keys.
 
+	// KeyOnly tells the iterator that the caller will not access value bytes
+	// from any item. When set, the iterator skips copying value bytes into
+	// the Item, saving a per-item memcpy on key-only forward scans (e.g.
+	// dgraph's has() predicate evaluator and index scans). The trade-off:
+	// Item.Value and Item.ValueCopy return ErrKeyOnlyMode, and
+	// Item.ValueSize / Item.EstimatedSize report 0. Item.Key, Version,
+	// UserMeta, ExpiresAt and IsDeletedOrExpired continue to work normally.
+	//
+	// PrefetchValues is forced to false when KeyOnly is true.
+	KeyOnly bool
+
 	// The following option is used to narrow down the SSTables that iterator
 	// picks up. If Prefix is specified, only tables which could have this
 	// prefix are picked based on their range of keys.
@@ -470,6 +505,12 @@ func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
 		panic(ErrDBClosed)
 	}
 
+	// KeyOnly disables value access, so prefetching values is nonsensical.
+	// Force PrefetchValues off so the prefetch goroutine is never started.
+	if opt.KeyOnly {
+		opt.PrefetchValues = false
+	}
+
 	y.NumIteratorsCreatedAdd(txn.db.opt.MetricsEnabled, 1)
 
 	// Keep track of the number of active iterators.
@@ -732,12 +773,23 @@ func (it *Iterator) fill(item *Item, key []byte, vs *y.ValueStruct) {
 	item.meta = vs.Meta
 	item.userMeta = vs.UserMeta
 	item.expiresAt = vs.ExpiresAt
+	item.keyOnly = it.opt.KeyOnly
 
 	item.version = y.ParseTs(key)
 	item.key = y.SafeCopy(item.key, y.ParseKey(key))
 
-	item.vptr = y.SafeCopy(item.vptr, vs.Value)
 	item.val = nil
+	if it.opt.KeyOnly {
+		// Don't copy vs.Value: KeyOnly callers have promised not to read
+		// it, and the SafeCopy is the largest per-item memmove on the
+		// key-only forward-scan hot path. nil out any leftover capacity
+		// from a previous item that was reused via the iterator's
+		// freelist; callers that ignore the contract will at least see a
+		// nil vptr rather than stale bytes.
+		item.vptr = nil
+	} else {
+		item.vptr = y.SafeCopy(item.vptr, vs.Value)
+	}
 	if it.opt.PrefetchValues {
 		item.wg.Add(1)
 		go func() {
diff --git a/iterator_dgraph_bench_test.go b/iterator_dgraph_bench_test.go
index c890709a3..61bd9d3a2 100644
--- a/iterator_dgraph_bench_test.go
+++ b/iterator_dgraph_bench_test.go
@@ -168,6 +168,41 @@ func BenchmarkDgraphPrefixScanKeyOnly(b *testing.B) {
 	b.ReportMetric(float64(nKeys), "keys/op")
 }
 
+// BenchmarkDgraphPrefixScanKeyOnlyOpt is the same workload as
+// BenchmarkDgraphPrefixScanKeyOnly but uses IteratorOptions.KeyOnly=true.
+// This measures the win from skipping the per-item SafeCopy(vptr) on the
+// has()/index-scan hot path; callers in this mode never read item.Value.
+func BenchmarkDgraphPrefixScanKeyOnlyOpt(b *testing.B) {
+	const (
+		ns    = uint64(0x0102030405060708)
+		nKeys = 200_000
+	)
+	db, dir := openDgraphDB(b)
+	defer func() { db.Close(); removeDir(dir) }()
+	dgraphLoadDB(b, db, ns, dgAttrName, nKeys, 1)
+
+	prefix := dgPrefix(ns, dgAttrName)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		txn := db.NewTransactionAt(math.MaxUint64, false)
+		opt := DefaultIteratorOptions
+		opt.Prefix = prefix
+		opt.KeyOnly = true
+		it := txn.NewIterator(opt)
+		count := 0
+		for it.Rewind(); it.Valid(); it.Next() {
+			count++
+		}
+		it.Close()
+		txn.Discard()
+		if count != nKeys {
+			b.Fatalf("expected %d keys, got %d", nKeys, count)
+		}
+	}
+	b.ReportMetric(float64(nKeys), "keys/op")
+}
+
 // BenchmarkDgraphPrefixScanAllVersions models the rollup path:
 // prefix-bounded forward iteration with PrefetchValues=false and
 // AllVersions=true, where each key has several MVCC versions.

From 32d58ab9f4e528ee0b886971df3db2bf19702336 Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 15:46:07 -0400
Subject: [PATCH 6/8] perf(iterator): store user-key only in lastKey, inline
 same-key compare

Iterator.lastKey previously stored the full key (user-key || 8B ts) but
SameKey only compared the user-key portion. Two consequences:

1. SafeCopy(it.lastKey, key) copied 8 bytes of dead ts every kept item.
2. y.SameKey called ParseKey on both src and dst (two slice ops + a
   function call) on the hot dedup compare.

Change: store the user-key only in lastKey; inline the compare.

  ukLen := len(key) - 8
  if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) {
      mi.Next()
      return false
  }
  it.lastKey = y.SafeCopy(it.lastKey, key[:ukLen])

The same-key dedup invariant ("skip later versions of a user-key we've
already considered, including tombstoned ones") is preserved exactly:
lastKey is still updated before the FILL/deleted-or-expired check.

Result: BenchmarkDgraphPrefixScanKeyOnlyOpt: 7.76M -> 7.58M ns (-2.3%).
Other benchmarks within run-to-run noise.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 iterator.go | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/iterator.go b/iterator.go
index 10b08ea92..f7c5cf5c1 100644
--- a/iterator.go
+++ b/iterator.go
@@ -468,7 +468,12 @@ type Iterator struct {
 	data  list
 	waste list
 
-	lastKey []byte // Used to skip over multiple versions of the same key.
+	// lastKey stores the user-key (no 8-byte timestamp suffix) of the most
+	// recently considered candidate, used to skip subsequent older versions
+	// of the same user-key on AllVersions=false forward scans. Storing the
+	// user-key only avoids one ParseKey per same-key compare and a per-item
+	// 8-byte memcpy on the update.
+	lastKey []byte
 
 	// canSeeInternalKeys is true when this iterator can possibly surface a
 	// badger-internal key (e.g. "!badger!banned"). When false, parseItem
@@ -717,7 +722,13 @@ func (it *Iterator) parseItem() bool {
 	// If iterating in forward direction, then just checking the last key against current key would
 	// be sufficient.
 	if !it.opt.Reverse {
-		if y.SameKey(it.lastKey, key) {
+		// lastKey holds the user-key only. Compare against the user-key
+		// portion of the current full key (last 8 bytes are the ts).
+		// bytes.Equal already short-circuits on length mismatch, but the
+		// explicit length check lets the compiler hoist the bounds check
+		// out of the user-key slice and keeps the hot path branch-tight.
+		ukLen := len(key) - 8
+		if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) {
 			mi.Next()
 			return false
 		}
@@ -726,7 +737,7 @@ func (it *Iterator) parseItem() bool {
 		// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
 		// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
 		// which is wrong. Therefore, update lastKey here.
-		it.lastKey = y.SafeCopy(it.lastKey, key)
+		it.lastKey = y.SafeCopy(it.lastKey, key[:ukLen])
 	}
 
 FILL:

From d98b3ed5c918f5593f790944e9ad9c346f95fa5a Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 16:37:46 -0400
Subject: [PATCH 7/8] docs(iterator): clarify FILL key-freshness and ukLen
 invariants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Strengthen comments on two hot-path invariants surfaced during code
review of the perf series:

1. FILL label entry contract: `key` is always mi.Key() at the current
   iitr position. The only goto FILL (reverse path) refreshes key after
   mi.Next(); the fall-through entry above never advances the iterator
   between `key := mi.Key()` and reaching FILL. This is what lets
   fill() safely reuse the caller-supplied key without re-calling
   mi.Key() per item.

2. lastKey ukLen safety: `ukLen := len(key) - 8` is safe because
   len(key) >= 8 is a badger-wide invariant (every LSM key is stored
   with an 8-byte timestamp suffix via y.KeyWithTs). y.ParseTs(key)
   above already relies on this — no defensive check needed.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 iterator.go | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/iterator.go b/iterator.go
index f7c5cf5c1..379b5e5dc 100644
--- a/iterator.go
+++ b/iterator.go
@@ -727,6 +727,11 @@ func (it *Iterator) parseItem() bool {
 		// bytes.Equal already short-circuits on length mismatch, but the
 		// explicit length check lets the compiler hoist the bounds check
 		// out of the user-key slice and keeps the hot path branch-tight.
+		//
+		// len(key) >= 8 is a badger-wide invariant: every key in the LSM is
+		// stored with an 8-byte timestamp suffix via y.KeyWithTs, and
+		// y.ParseTs(key) above already relies on this (it indexes
+		// key[len(key)-8:]). No defensive check is needed here.
 		ukLen := len(key) - 8
 		if ukLen == len(it.lastKey) && bytes.Equal(key[:ukLen], it.lastKey) {
 			mi.Next()
@@ -741,9 +746,12 @@ func (it *Iterator) parseItem() bool {
 	}
 
 FILL:
-	// If deleted, advance and return. We fetch vs (and reuse key from the
-	// enclosing scope, or refetch after Next on the reverse goto below) so
-	// that fill does not have to call mi.Value() / mi.Key() a second time.
+	// Invariant on entry to FILL: `key` is mi.Key() at the *current* iitr
+	// position. The only goto FILL (below, reverse path) refreshes `key`
+	// after mi.Next(); the fall-through entry from above never advances the
+	// iterator between `key := mi.Key()` and reaching FILL. fill() can
+	// therefore safely reuse the caller-supplied key without re-calling
+	// mi.Key().
 	vs := mi.Value()
 	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
 		mi.Next()

From 16e8b0f7b904cf3cdc72463d8996629db551f8e0 Mon Sep 17 00:00:00 2001
From: Shaun Patterson <shaunpatterson@gmail.com>
Date: Sat, 23 May 2026 16:37:57 -0400
Subject: [PATCH 8/8] test: cover KeyOnly iterator paths and preserved-behavior
 regressions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a comprehensive test file that hits every line of the perf series
touched in db.go and iterator.go. Two test groups:

* TestKeyOnlyIterator_* — exercise the new KeyOnly opt-in path:
  Value/ValueCopy return ErrKeyOnlyMode; EstimatedSize returns key
  length; ValueSize returns 0; NewIterator forces PrefetchValues=false;
  Key/UserMeta/Version/ExpiresAt continue to work under KeyOnly.

* TestRegression* — lock in pre-existing behaviors that the perf series
  rewrote in place: same-key dedup across multiple MVCC versions
  (iter5 lastKey rewrite), internal-key hiding (iter3 canSeeInternalKeys
  gate), isBanned correctness across the new atomic.Bool fast path
  (iter1) including short-key-with-bans branch, and non-KeyOnly fill
  path correctness (iter2 hoist).

* TestCanSeeInternalKeys — table-driven unit test for the helper
  added in iter3.

All preserved-behavior tests also pass on upstream/main, proving the
expectations match the unchanged baseline. Together with the
benchmarks added in 7174cc2, the touched lines in db.go and
iterator.go now have 100% coverage.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 iterator_keyonly_test.go | 399 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 399 insertions(+)
 create mode 100644 iterator_keyonly_test.go

diff --git a/iterator_keyonly_test.go b/iterator_keyonly_test.go
new file mode 100644
index 000000000..8a7d1c601
--- /dev/null
+++ b/iterator_keyonly_test.go
@@ -0,0 +1,399 @@
+/*
+ * SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package badger
+
+import (
+	"errors"
+	"fmt"
+	"testing"
+
+	"github.com/dgraph-io/badger/v4/y"
+	"github.com/stretchr/testify/require"
+)
+
+// TestKeyOnlyIterator_ValueReturnsErrKeyOnlyMode covers iterator.go:Item.Value
+// short-circuit when item.keyOnly is set (iter4).
+func TestKeyOnlyIterator_ValueReturnsErrKeyOnlyMode(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		txnSet(t, db, []byte("k1"), []byte("v1"), 0)
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.KeyOnly = true
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			for it.Rewind(); it.Valid(); it.Next() {
+				item := it.Item()
+				err := item.Value(func(v []byte) error { return nil })
+				require.True(t, errors.Is(err, ErrKeyOnlyMode),
+					"Value() should return ErrKeyOnlyMode, got %v", err)
+			}
+			return nil
+		}))
+	})
+}
+
+// TestKeyOnlyIterator_ValueCopyReturnsErrKeyOnlyMode covers iterator.go:Item.ValueCopy.
+func TestKeyOnlyIterator_ValueCopyReturnsErrKeyOnlyMode(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		txnSet(t, db, []byte("k1"), []byte("v1"), 0)
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.KeyOnly = true
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			for it.Rewind(); it.Valid(); it.Next() {
+				item := it.Item()
+				buf, err := item.ValueCopy(nil)
+				require.True(t, errors.Is(err, ErrKeyOnlyMode))
+				require.Nil(t, buf)
+			}
+			return nil
+		}))
+	})
+}
+
+// TestKeyOnlyIterator_EstimatedSizeReturnsKeyLen covers iterator.go:Item.EstimatedSize
+// short-circuit (returns int64(len(item.key)) when keyOnly).
+func TestKeyOnlyIterator_EstimatedSizeReturnsKeyLen(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		txnSet(t, db, []byte("abc"), []byte("lots-of-value-bytes-here"), 0)
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.KeyOnly = true
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			it.Rewind()
+			require.True(t, it.Valid())
+			item := it.Item()
+			require.Equal(t, int64(len("abc")), item.EstimatedSize())
+			return nil
+		}))
+	})
+}
+
+// TestKeyOnlyIterator_ValueSizeIsZero covers iterator.go:Item.ValueSize short-circuit.
+func TestKeyOnlyIterator_ValueSizeIsZero(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		txnSet(t, db, []byte("k1"), []byte("hello-world"), 0)
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.KeyOnly = true
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			it.Rewind()
+			require.True(t, it.Valid())
+			require.Equal(t, int64(0), it.Item().ValueSize())
+			return nil
+		}))
+	})
+}
+
+// TestKeyOnlyIterator_ForcesPrefetchOff covers iterator.go:NewIterator forcing
+// PrefetchValues=false when KeyOnly=true.
+func TestKeyOnlyIterator_ForcesPrefetchOff(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		txnSet(t, db, []byte("k1"), []byte("v1"), 0)
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.KeyOnly = true
+			opt.PrefetchValues = true
+			opt.PrefetchSize = 100
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			require.False(t, it.opt.PrefetchValues,
+				"NewIterator must force PrefetchValues=false when KeyOnly=true")
+			it.Rewind()
+			require.True(t, it.Valid())
+			return nil
+		}))
+	})
+}
+
+// TestKeyOnlyIterator_KeyMetaVersionWork verifies that the metadata methods on Item
+// still work correctly in KeyOnly mode (per the contract documented on KeyOnly).
+// Indirectly covers fill() setting item.keyOnly while still populating meta/version.
+func TestKeyOnlyIterator_KeyMetaVersionWork(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		key := []byte("hello")
+		val := []byte("world")
+		txn := db.NewTransaction(true)
+		require.NoError(t, txn.SetEntry(NewEntry(key, val).WithMeta(0x42)))
+		require.NoError(t, txn.Commit())
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.KeyOnly = true
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			it.Rewind()
+			require.True(t, it.Valid())
+			item := it.Item()
+			require.Equal(t, key, item.Key())
+			require.Equal(t, byte(0x42), item.UserMeta())
+			require.NotZero(t, item.Version())
+			require.False(t, item.IsDeletedOrExpired())
+			require.Equal(t, int64(len(key)), item.KeySize())
+			require.False(t, item.DiscardEarlierVersions())
+			return nil
+		}))
+	})
+}
+
+// TestKeyOnlyIterator_PrefetchValuesFalseStillWorks covers fill() KeyOnly branch
+// without prefetch (the actual hot path; PrefetchValues=false bypasses the
+// prefetch goroutine entirely).
+func TestKeyOnlyIterator_PrefetchValuesFalseStillWorks(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		for i := 0; i < 20; i++ {
+			txnSet(t, db, []byte(fmt.Sprintf("k%02d", i)), []byte("vvv"), 0)
+		}
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.PrefetchValues = false
+			opt.KeyOnly = true
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			count := 0
+			for it.Rewind(); it.Valid(); it.Next() {
+				item := it.Item()
+				require.NotEmpty(t, item.Key())
+				err := item.Value(func(v []byte) error { return nil })
+				require.True(t, errors.Is(err, ErrKeyOnlyMode))
+				count++
+			}
+			require.Equal(t, 20, count)
+			return nil
+		}))
+	})
+}
+
+// TestCanSeeInternalKeys is a table-driven unit test for the iter3 helper.
+// It is the single source of truth for which prefix shapes allow internal keys.
+func TestCanSeeInternalKeys(t *testing.T) {
+	cases := []struct {
+		name   string
+		prefix []byte
+		want   bool
+	}{
+		{"empty prefix sees everything", nil, true},
+		{"empty slice sees everything", []byte{}, true},
+		{"prefix starting with '!' (badgerPrefix[0]) may overlap", []byte("!badger"), true},
+		{"prefix starting with 0x21 (== '!') overlap", []byte{0x21, 0xff}, true},
+		{"prefix starting with 0x00 cannot overlap", []byte{0x00, 0x01, 0x02}, false},
+		{"prefix starting with 'a' cannot overlap", []byte("abc"), false},
+		{"prefix starting with high byte cannot overlap", []byte{0xff}, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := canSeeInternalKeys(tc.prefix)
+			require.Equal(t, tc.want, got)
+		})
+	}
+}
+
+// TestRegressionSameKeyDedup locks in the iter5 lastKey behavior: when multiple
+// versions of the same user-key exist and AllVersions=false, the iterator must
+// surface exactly one item per user-key (the freshest non-expired one). This
+// test exercises the user-key-only comparison path.
+func TestRegressionSameKeyDedup(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		// Write three versions of "k1" and three versions of "k2".
+		for _, k := range []string{"k1", "k2"} {
+			for v := 1; v <= 3; v++ {
+				txnSet(t, db, []byte(k), []byte(fmt.Sprintf("v%d", v)), 0)
+			}
+		}
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.PrefetchValues = false
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			var seen []string
+			for it.Rewind(); it.Valid(); it.Next() {
+				seen = append(seen, string(it.Item().Key()))
+			}
+			require.Equal(t, []string{"k1", "k2"}, seen,
+				"non-AllVersions iterator must dedup to one item per user-key")
+			return nil
+		}))
+
+		// AllVersions=true should see all 3 versions per key.
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.AllVersions = true
+			opt.PrefetchValues = false
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			counts := map[string]int{}
+			for it.Rewind(); it.Valid(); it.Next() {
+				counts[string(it.Item().Key())]++
+			}
+			require.Equal(t, 3, counts["k1"])
+			require.Equal(t, 3, counts["k2"])
+			return nil
+		}))
+	})
+}
+
+// TestRegressionInternalKeysHidden locks in iter3 behavior: a default user
+// iterator (no InternalAccess) must not surface badger-internal keys even when
+// the optimized canSeeInternalKeys path is taken (prefix nil or '!').
+func TestRegressionInternalKeysHidden(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		txnSet(t, db, []byte("user-key"), []byte("v"), 0)
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			it := txn.NewIterator(DefaultIteratorOptions)
+			defer it.Close()
+			for it.Rewind(); it.Valid(); it.Next() {
+				k := it.Item().Key()
+				require.False(t, isBadgerInternalKey(k),
+					"default iterator surfaced internal key %q", k)
+			}
+			return nil
+		}))
+	})
+}
+
+func isBadgerInternalKey(k []byte) bool {
+	if len(k) < len(badgerPrefix) {
+		return false
+	}
+	for i := range badgerPrefix {
+		if k[i] != badgerPrefix[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// TestRegressionIsBannedNoBans locks in iter1's fast-path correctness:
+// when no namespaces have been banned, isBanned must return nil for any key
+// (regardless of NamespaceOffset). Exercises the hasAny.Load()==false branch.
+func TestRegressionIsBannedNoBans(t *testing.T) {
+	opt := getTestOptions("")
+	opt.NamespaceOffset = 0
+	runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
+		// hasAny is false at this point - never any bans on a fresh DB.
+		require.False(t, db.bannedNamespaces.hasAny.Load())
+
+		// isBanned must return nil for any key on a fresh DB.
+		key := y.KeyWithTs([]byte("hello-world-12345"), 1)
+		require.NoError(t, db.isBanned(key))
+
+		// Empty key, short key - all fine, fast path doesn't even reach the
+		// length check.
+		require.NoError(t, db.isBanned(nil))
+		require.NoError(t, db.isBanned([]byte("x")))
+	})
+}
+
+// TestRegressionIsBannedWithBan covers the slow-path: hasAny=true, key matches
+// a banned namespace.
+func TestRegressionIsBannedWithBan(t *testing.T) {
+	opt := getTestOptions("")
+	opt.NamespaceOffset = 0
+	runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
+		const ns = uint64(0x4242)
+		require.NoError(t, db.BanNamespace(ns))
+		require.True(t, db.bannedNamespaces.hasAny.Load(),
+			"hasAny must flip to true after add()")
+
+		// Key in banned namespace: ns bytes (big-endian) at offset 0 + suffix
+		// + 8B ts.
+		banned := y.KeyWithTs(append(y.U64ToBytes(ns), []byte("suffix")...), 1)
+		require.ErrorIs(t, db.isBanned(banned), ErrBannedKey)
+
+		// Key in a different namespace: not banned.
+		other := y.KeyWithTs(append(y.U64ToBytes(0x1111), []byte("suffix")...), 1)
+		require.NoError(t, db.isBanned(other))
+	})
+}
+
+// TestRegressionIsBannedShortKeyWithBans covers the slow path early-return when
+// hasAny=true but the key is too short to extract a namespace.
+func TestRegressionIsBannedShortKeyWithBans(t *testing.T) {
+	opt := getTestOptions("")
+	opt.NamespaceOffset = 0
+	runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
+		const ns = uint64(0x4242)
+		require.NoError(t, db.BanNamespace(ns))
+
+		// Key shorter than NamespaceOffset+8 should be accepted (nil) — the
+		// pre-existing semantics treat un-namespaceable keys as non-banned.
+		require.NoError(t, db.isBanned([]byte("short")))
+		require.NoError(t, db.isBanned(nil))
+	})
+}
+
+// TestRegressionIsBannedNegativeOffset covers the early-return: NamespaceOffset<0
+// short-circuits before any atomic load.
+func TestRegressionIsBannedNegativeOffset(t *testing.T) {
+	opt := getTestOptions("")
+	opt.NamespaceOffset = -1
+	runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
+		require.NoError(t, db.isBanned([]byte("anything")))
+		require.NoError(t, db.isBanned(nil))
+	})
+}
+
+// TestRegressionFillNonKeyOnly exercises the non-KeyOnly fill path (iter2's
+// hoist), verifying that values are still copied correctly when KeyOnly=false.
+// This is the path that all existing badger users hit by default.
+func TestRegressionFillNonKeyOnly(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		txnSet(t, db, []byte("k1"), []byte("expected-value-bytes"), 0)
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.PrefetchValues = false // exercise yieldItemValue path
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			it.Rewind()
+			require.True(t, it.Valid())
+			got, err := it.Item().ValueCopy(nil)
+			require.NoError(t, err)
+			require.Equal(t, []byte("expected-value-bytes"), got)
+			return nil
+		}))
+	})
+}
+
+// TestRegressionPrefetchValuesTrue exercises the prefetch goroutine path,
+// confirming KeyOnly=false + PrefetchValues=true still works (iter2 + iter4
+// didn't break the default).
+func TestRegressionPrefetchValuesTrue(t *testing.T) {
+	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+		for i := 0; i < 5; i++ {
+			txnSet(t, db, []byte(fmt.Sprintf("k%d", i)), []byte(fmt.Sprintf("v%d", i)), 0)
+		}
+
+		require.NoError(t, db.View(func(txn *Txn) error {
+			opt := DefaultIteratorOptions
+			opt.PrefetchValues = true
+			opt.PrefetchSize = 10
+			it := txn.NewIterator(opt)
+			defer it.Close()
+			seen := 0
+			for it.Rewind(); it.Valid(); it.Next() {
+				item := it.Item()
+				v, err := item.ValueCopy(nil)
+				require.NoError(t, err)
+				require.NotEmpty(t, v)
+				seen++
+			}
+			require.Equal(t, 5, seen)
+			return nil
+		}))
+	})
+}