diff --git a/level_iter.go b/level_iter.go index 705be0adb91..16b2b719a00 100644 --- a/level_iter.go +++ b/level_iter.go @@ -703,6 +703,71 @@ func (l *levelIter) internalSeekGE( return l.verify(kv), kvMeta } +// SeekPrefixGE implements InternalIterator.SeekPrefixGE. It positions the +// iterator at the first key greater than or equal to key across all files in +// the level. It returns the key-value pair at that position, or nil if no such +// key exists. +// +// The prefix argument is passed to each file's iterator for bloom filter +// checking. If a file's bloom filter indicates that prefix is not present, that +// file is skipped, and the iterator moves to the next file if and only if the +// next file also can contain the prefix. The key argument is used for the +// actual seek positioning. +// +// # Prefix vs key +// +// The prefix is typically the prefix of key (i.e. Split.Prefix(key) == prefix), +// but this is not required. The only requirement is that prefix be less than or +// equal to Split.Prefix(key). This flexibility is used when a higher-level +// range deletion invalidates keys between prefix and Split.Prefix(key). +// +// For example, consider a SeekPrefixGE for prefix b and assume a RANGEDEL +// [a, c@3) from a higher level. In this case, the merging iterator must seek +// past the tombstone so it calls SeekPrefixGE(b, c@3, flags). The bloom filter +// is checked against b (the original prefix), while the actual seek targets +// c@3. +// +// # File positioning and TrySeekUsingNext +// +// The prefix is stored and controls file advancement when the current file is +// exhausted. If the current file's largest key has a prefix greater than the +// seek prefix, the iterator stops rather than advancing to the next file. +// +// This stopping condition is critical for TrySeekUsingNext correctness. Without +// it, the following scenario would produce incorrect results: +// +// 1. SeekPrefixGE(P1, key1) - bloom filter misses on file F, no key found +// 2. Iterator advances to file G (incorrectly, if G's smallest prefix > P1) +// 3. SeekPrefixGE(P2, key2, TrySeekUsingNext) where P1 < P2 < G's smallest prefix +// 4. TrySeekUsingNext starts from G, completely skipping file F which may +// contain keys with prefix P2 +// +// The stopping condition prevents step 2: if F's largest prefix > P1, the +// iterator remains at F, allowing the subsequent seek for P2 to correctly +// examine F. +// +// # Return values and iterator state +// +// - If a key is found: returns the key-value pair and positions the iterator +// at that key. The iterator may be positioned in any file that contains +// matching keys. +// +// - If no key is found: returns nil. The iterator is exhausted in the forward +// direction. Subsequent Next() calls will return nil. +// +// # TrySeekUsingNext optimization +// +// The TrySeekUsingNext flag in flags indicates that the caller knows the seek +// key is greater than or equal to the iterator's current position. When set: +// +// - At the file level: the iterator may scan forward through files rather +// than performing a binary search through the file metadata. +// +// - At the sstable level: the optimization is automatically disabled when a +// new file is loaded, since the new file's iterator is not yet positioned. +// +// Note: The caller must ensure that key is greater than or equal to the lower +// bound. SeekPrefixGE checks the upper bound but not the lower bound. func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) (kv *base.InternalKV) { if treesteps.Enabled && treesteps.IsRecording(l) { op := treesteps.StartOpf(l, "SeekPrefixGE(%q, %q, %d)", prefix, key, flags) diff --git a/sstable/reader_iter_single_lvl.go b/sstable/reader_iter_single_lvl.go index 8bf415ce9dd..0742cd3db29 100644 --- a/sstable/reader_iter_single_lvl.go +++ b/sstable/reader_iter_single_lvl.go @@ -839,9 +839,45 @@ func (i *singleLevelIterator[I, PI, D, PD]) seekGEHelper( return i.skipForward(shouldReturnMeta) } -// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the -// pebble package. Note that SeekPrefixGE only checks the upper bound. It is up -// to the caller to ensure that key is greater than or equal to the lower bound. +// SeekPrefixGE implements InternalIterator.SeekPrefixGE. It positions the +// iterator at the first key greater than or equal to key, as long as that key +// has a prefix matching prefix. It returns the key-value pair at that position, +// or nil if no such key exists in the table. +// +// The prefix argument is used exclusively for bloom filter checking. If the +// table has a bloom filter and the filter indicates that prefix is not present, +// SeekPrefixGE returns nil without positioning the iterator. The key argument +// is used for the actual seek positioning when the bloom filter check passes or +// is not applicable. +// +// The prefix is typically the prefix of key (i.e. Split.Prefix(key) == prefix), +// but this is not required. The only requirement is that prefix be less than or +// equal to Split.Prefix(key). This flexibility is used by higher-layer +// iterators. +// +// Return values and iterator state: +// +// - If a key is found: returns the key-value pair and positions the iterator +// at that key. Subsequent Next() calls will return following keys. +// +// - If no key is found because the bloom filter excluded the prefix: returns +// nil. The iterator is not positioned; calling Next() or Prev() after this +// is not permitted. +// +// - If no key is found but the bloom filter matched (or was not used): returns +// nil. The iterator may be exhausted (reached bounds or end of table). +// Calling Next() when exhausted forward will panic. +// +// The TrySeekUsingNext flag in flags indicates that the caller knows the seek +// key is greater than or equal to the iterator's current position. When set, +// the iterator may optimize by scanning forward from its current position +// rather than performing a full seek. This optimization is automatically +// disabled (i.e. SeekPrefixGE ignores the TrySeekUsingNext flag) when the +// previous SeekPrefixGE returned nil due to a bloom filter miss, since the +// iterator was not repositioned in that case. +// +// Note: SeekPrefixGE only checks the upper bound. It is up to the caller to +// ensure that key is greater than or equal to the lower bound. func (i *singleLevelIterator[I, PI, D, PD]) SeekPrefixGE( prefix, key []byte, flags base.SeekGEFlags, ) (kv *base.InternalKV) {