From 9a25ee793e1631af618e68c4546463662215eea2 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Wed, 21 Jan 2026 09:04:22 -0800 Subject: [PATCH] sstable: push filter check inside cache Previously, every bloom filter check in `SeekPrefixGE` required: 1. Getting the filter block from cache (atomic refcount increment) 2. Calling `MayContain` on the filter data (a very fast operation) 3. Releasing the block (atomic refcount decrement) 4. Updating hit counter (atomic increment) For a simple bloom filter check (a few hash operations), the atomic refcount operations represent significant overhead (especially when there are a small number of very hot filters in the upper LSM levels). Add a new `cache.Handle.TableFilterMayContain` method that performs the filter check while holding the cache's read lock, without incrementing/decrementing the refcount. The value is safe to access while holding the read lock since eviction requires the write lock. When the filter block is not in cache, the code falls back to the existing `readFilterBlock` path which handles reading from disk and populating the cache. We no longer record these accesses as cache hits for statistics purposes. We still record misses, and since we now separate statistics by block type (and level), we'll still know what percentage of overall misses are due to filter blocks. --- internal/cache/cache.go | 22 ++++++++++++++++++++++ internal/cache/clockpro.go | 28 ++++++++++++++++++++++++++++ sstable/block/block.go | 5 +++++ sstable/filter.go | 5 +++++ sstable/reader_iter_single_lvl.go | 15 +++++++++++++++ 5 files changed, 75 insertions(+) diff --git a/internal/cache/cache.go b/internal/cache/cache.go index bb5288efcf9..9d607404bb2 100644 --- a/internal/cache/cache.go +++ b/internal/cache/cache.go @@ -271,6 +271,28 @@ func (c *Handle) Get( return c.cache.getShard(k).get(k, level, category, false /* peekOnly */) } +// TableFilterMayContain looks up a cached filter block and checks if it may contain the +// given key, while holding the cache's read lock. This avoids the refcount +// overhead of Get/Release for the common case of bloom filter checks. +// +// TableFilterMayContain does not update hit counters. +// +// The dataOffset specifies how many bytes to skip at the start of the cached +// buffer (typically block.MetadataSize to skip block metadata). +// +// Returns (true, mayContain) if found; (false, false) if not in cache. +// When not found, the caller should fall back to reading from disk. +func (c *Handle) TableFilterMayContain( + fileNum base.DiskFileNum, + offset uint64, + dataOffset int, + filter base.TableFilterDecoder, + filterKey []byte, +) (found bool, mayContain bool) { + k := makeKey(c.id, fileNum, offset) + return c.cache.getShard(k).tableFilterMayContain(k, dataOffset, filter, filterKey) +} + // GetWithReadHandle retrieves the cache value for the specified handleID, fileNum // and offset. If found, a valid Handle is returned (with cacheHit set to // true), else a valid ReadHandle is returned. diff --git a/internal/cache/clockpro.go b/internal/cache/clockpro.go index dfda772a9f6..8bb46e54d38 100644 --- a/internal/cache/clockpro.go +++ b/internal/cache/clockpro.go @@ -184,6 +184,34 @@ func (c *shard) getWithReadEntry(k key, level base.Level, category Category) (*V return nil, re } +// tableFilterMayContain looks up a cache entry and checks if the filter may +// contain the given key, while holding the read lock. This avoids refcount +// overhead for bloom filter checks. +// +// The dataOffset specifies how many bytes to skip at the start of the cached +// buffer (to skip block metadata). +// +// tableFilterMayContain does not update hit counters. +// +// Returns (true, mayContain) if found; (false, false) otherwise. +func (c *shard) tableFilterMayContain( + k key, dataOffset int, filter base.TableFilterDecoder, filterKey []byte, +) (found bool, mayContain bool) { + c.mu.RLock() + defer c.mu.RUnlock() + + if e, _ := c.blocks.Get(k); e != nil { + if v := e.val; v != nil && len(v.buf) > dataOffset { + // Update referenced flag for CLOCK-Pro (same as regular get). + if !e.referenced.Load() { + e.referenced.Store(true) + } + return true, filter.MayContain(v.buf[dataOffset:], filterKey) + } + } + return false, false +} + func (c *shard) set(k key, value *Value, markAccessed bool) { c.mu.Lock() defer c.mu.Unlock() diff --git a/sstable/block/block.go b/sstable/block/block.go index 4664e360681..38d30dcffd9 100644 --- a/sstable/block/block.go +++ b/sstable/block/block.go @@ -375,6 +375,11 @@ func (r *Reader) ChecksumType() ChecksumType { return r.checksumType } +// CacheHandle returns the cache Handle, or nil if caching is disabled. +func (r *Reader) CacheHandle() *cache.Handle { + return r.opts.CacheOpts.CacheHandle +} + var kindToCacheCategory = [blockkind.NumKinds]cache.Category{ blockkind.Unknown: cache.CategoryBackground, blockkind.SSTableData: cache.CategorySSTableData, diff --git a/sstable/filter.go b/sstable/filter.go index c330253dcae..2a66a3a80bd 100644 --- a/sstable/filter.go +++ b/sstable/filter.go @@ -65,3 +65,8 @@ func (f *tableFilterReader) mayContain(data, key []byte) bool { } return mayContain } + +// Decoder returns the underlying filter decoder. +func (f *tableFilterReader) Decoder() base.TableFilterDecoder { + return f.decoder +} diff --git a/sstable/reader_iter_single_lvl.go b/sstable/reader_iter_single_lvl.go index 8bf415ce9dd..5ce7952c503 100644 --- a/sstable/reader_iter_single_lvl.go +++ b/sstable/reader_iter_single_lvl.go @@ -1027,6 +1027,21 @@ func (i *singleLevelIterator[I, PI, D, PD]) bloomFilterMayContain(prefix []byte) } } + // Fast path: check directly in cache without refcount overhead. + if cacheHandle := i.reader.blockReader.CacheHandle(); cacheHandle != nil { + found, mayContain := cacheHandle.TableFilterMayContain( + i.reader.blockReader.FileNum(), + i.reader.filterBH.Offset, + block.MetadataSize, + i.reader.tableFilter.Decoder(), + prefixToCheck, + ) + if found { + return mayContain, nil + } + } + + // Slow path: read filter block from disk. dataH, err := i.reader.readFilterBlock(i.ctx, i.readEnv.Block, i.indexFilterRH, i.reader.filterBH) if err != nil { return false, err