
Commit 23003a7

wkalt and claude authored
perf: avoid materializing RoaringBitmap::full() in fragment allow-list (#6664)
The fragment-bitmap allow-list filter dominates the cost of merge_insert and of the vector-search prefilter on tables that have received writes since their index was built. The filter ANDs an AllowList of full fragments with the deletion BlockList. RowAddrMask::bitand expands AllowList & BlockList into AllowList - BlockList, and the per-fragment (Full - Partial) branch in RowAddrTreeMap::sub_assign materializes RoaringBitmap::full() for every fragment with deletions.

This change builds the equivalent BlockList directly (the deletions unioned with the rows of fragments outside the index bitmap) using only Full markers and a HashMap clone, so no full bitmap is ever materialized. The existing #6563 stale-index tests cover correctness.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e59f4bd commit 23003a7

3 files changed: 257 additions & 12 deletions
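
To make the cost described in the commit message concrete, the sketch below contrasts the two shapes of work. It is a standalone illustration, not code from this commit: FragMask is a hypothetical stand-in for the per-fragment entries of RowAddrTreeMap, and only roaring::RoaringBitmap and its full() constructor are real APIs.

// Minimal sketch (assumes the `roaring` crate); `FragMask` is hypothetical.
use roaring::RoaringBitmap;

// A fragment's row mask: either "every row" (a cheap marker) or an explicit set.
enum FragMask {
    Full,
    Partial(RoaringBitmap),
}

fn main() {
    // Deletions in one fragment: rows 0, 10, ..., 90.
    let deletions: RoaringBitmap = (0..100u32).step_by(10).collect();

    // Old shape: AllowList(Full) & BlockList(Partial) is rewritten as
    // AllowList(Full) - BlockList(Partial), which forces the Full side to be
    // materialized before the subtraction runs, once per fragment with
    // deletions on every call.
    let mut materialized = RoaringBitmap::full(); // a bitmap over all row offsets
    materialized -= &deletions;
    let _old = FragMask::Partial(materialized);

    // New shape: build the block list directly. Deleted rows stay Partial and
    // fragments outside the index bitmap become Full markers, so nothing ever
    // expands a full bitmap.
    let _non_index_frag = FragMask::Full;
    let _deleted_rows = FragMask::Partial(deletions);
}

The design choice mirrors the diff in prefilter.rs below: non-index fragments are folded into the deletion BlockList via insert_fragment (a Full marker) instead of being intersected away through an AllowList.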

rust/lance/Cargo.toml

Lines changed: 4 additions & 0 deletions
@@ -158,6 +158,10 @@ required-features = ["cli"]
 name = "scalar_index"
 harness = false
 
+[[bench]]
+name = "merge_insert"
+harness = false
+
 [[bench]]
 name = "scan"
 harness = false

rust/lance/benches/merge_insert.rs

Lines changed: 225 additions & 0 deletions
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//
//! Benchmarks for `merge_insert` with a scalar index on the merge key.
//!
//! The interesting case is the dataset shape that exercises the fragment-
//! bitmap allow-list path in `DatasetPreFilter::create_restricted_deletion_mask`:
//!
//! - some fragments live OUTSIDE the index's `fragment_bitmap`
//!   (i.e. data was appended after the index was built, or partially
//!   rewritten without re-indexing), AND
//! - some fragments INSIDE the bitmap have a deletion file.
//!
//! Both conditions together force the slow `AllowList(Full) - BlockList(Partial)`
//! computation. The other shapes (`clean`, `with_new_rows_only`,
//! `with_deletions_only`) skip that branch and serve as controls.
//!
//! Run with `cargo bench --bench merge_insert`.

use std::sync::Arc;

use arrow_array::{Int64Array, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use criterion::{Criterion, criterion_group, criterion_main};
use lance::dataset::write::merge_insert::{MergeInsertBuilder, WhenMatched, WhenNotMatched};
use lance::dataset::{Dataset, WriteMode, WriteParams};
use lance::index::DatasetIndexExt;
use lance_core::utils::tempfile::TempStrDir;
use lance_index::IndexType;
use lance_index::scalar::ScalarIndexParams;
#[cfg(target_os = "linux")]
use pprof::criterion::{Output, PProfProfiler};

// Many small fragments to amplify the slow path: each indexed fragment with
// a deletion file produces one RoaringBitmap::full() allocation per
// merge_insert call. Cost scales linearly with NUM_FRAGS.
const ROWS_PER_FRAG: u64 = 100;
const NUM_FRAGS: u64 = 200;
// Minimal schema so the merge_insert baseline (sort, hash-join, write) is small
// and the prefilter overhead dominates the measurement.
fn schema() -> Arc<ArrowSchema> {
    Arc::new(ArrowSchema::new(vec![Field::new(
        "id",
        DataType::Int64,
        false,
    )]))
}

fn make_batch(start_id: i64, n: usize) -> RecordBatch {
    let ids = Int64Array::from_iter_values(start_id..start_id + n as i64);
    RecordBatch::try_new(schema(), vec![Arc::new(ids)]).unwrap()
}

fn make_batches(start_id: i64, total_rows: u64) -> Vec<RecordBatch> {
    let mut out = Vec::new();
    let mut remaining = total_rows;
    let mut next_start = start_id;
    while remaining > 0 {
        let n = remaining.min(ROWS_PER_FRAG) as usize;
        out.push(make_batch(next_start, n));
        next_start += n as i64;
        remaining -= n as u64;
    }
    out
}

async fn build_indexed_base(path: &str) -> Dataset {
    let total = ROWS_PER_FRAG * NUM_FRAGS;
    let params = WriteParams {
        max_rows_per_file: ROWS_PER_FRAG as usize,
        max_rows_per_group: ROWS_PER_FRAG as usize,
        mode: WriteMode::Create,
        ..Default::default()
    };
    let batches = make_batches(0, total);
    let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema());
    Dataset::write(reader, path, Some(params)).await.unwrap();

    let mut ds = Dataset::open(path).await.unwrap();
    ds.create_index(
        &["id"],
        IndexType::BTree,
        None,
        &ScalarIndexParams::default(),
        true,
    )
    .await
    .unwrap();
    ds
}

async fn append_rows(path: &str, base_id: i64, n: usize) -> Dataset {
    let batch = make_batch(base_id, n);
    let reader = RecordBatchIterator::new(std::iter::once(Ok(batch)), schema());
    let params = WriteParams {
        max_rows_per_file: n,
        max_rows_per_group: n,
        mode: WriteMode::Append,
        ..Default::default()
    };
    Dataset::write(reader, path, Some(params)).await.unwrap();
    Dataset::open(path).await.unwrap()
}

async fn delete_some_indexed_rows(ds: &mut Dataset) {
    // Delete a sparse pattern that lands in EVERY indexed fragment (one row
    // per fragment, since ROWS_PER_FRAG = 100 and we delete `id % 100 == 0`).
    // Each affected fragment gets its own deletion file inside the bitmap,
    // which is what scales the slow `RoaringBitmap::full()` materialization
    // path: one allocation per fragment per merge_insert call.
    ds.delete("id % 100 == 0").await.unwrap();
}

/// One merge_insert op: 30 updates of existing IDs + 70 inserts of new IDs.
async fn one_merge_insert(ds: Arc<Dataset>, base_existing: i64, base_new: i64) {
    let mut ids: Vec<i64> = (0..30).map(|i| base_existing + i).collect();
    ids.extend(base_new..base_new + 70);
    let id_arr = Int64Array::from(ids);
    let batch = RecordBatch::try_new(schema(), vec![Arc::new(id_arr)]).unwrap();
    let reader = RecordBatchIterator::new(std::iter::once(Ok(batch)), schema());

    let mut builder = MergeInsertBuilder::try_new(ds, vec!["id".to_string()]).unwrap();
    builder
        .when_matched(WhenMatched::UpdateAll)
        .when_not_matched(WhenNotMatched::InsertAll);
    let job = builder.try_build().unwrap();
    let _ = job.execute_reader(reader).await.unwrap();
}

async fn build_clean() -> (TempStrDir, Arc<Dataset>) {
    let dir = TempStrDir::default();
    let path = dir.as_str().to_string();
    let ds = build_indexed_base(&path).await;
    (dir, Arc::new(ds))
}

async fn build_with_new_rows_only() -> (TempStrDir, Arc<Dataset>) {
    let dir = TempStrDir::default();
    let path = dir.as_str().to_string();
    build_indexed_base(&path).await;
    let base = (ROWS_PER_FRAG * NUM_FRAGS) as i64;
    let ds = append_rows(&path, base, 500).await;
    (dir, Arc::new(ds))
}

async fn build_with_deletions_only() -> (TempStrDir, Arc<Dataset>) {
    let dir = TempStrDir::default();
    let path = dir.as_str().to_string();
    let mut ds = build_indexed_base(&path).await;
    delete_some_indexed_rows(&mut ds).await;
    (dir, Arc::new(ds))
}

async fn build_with_new_rows_and_deletions() -> (TempStrDir, Arc<Dataset>) {
    let dir = TempStrDir::default();
    let path = dir.as_str().to_string();
    build_indexed_base(&path).await;
    let base = (ROWS_PER_FRAG * NUM_FRAGS) as i64;
    let mut ds = append_rows(&path, base, 500).await;
    delete_some_indexed_rows(&mut ds).await;
    (dir, Arc::new(ds))
}

fn bench_one_shape<F, Fut>(c: &mut Criterion, name: &str, builder: F)
where
    F: FnOnce() -> Fut,
    Fut: std::future::Future<Output = (TempStrDir, Arc<Dataset>)>,
{
    let rt = tokio::runtime::Runtime::new().unwrap();
    let (_dir, ds) = rt.block_on(builder());
    // Cache a snapshot version so each iteration restores to the same baseline.
    let base_version = ds.version().version;
    let path = ds.uri().to_string();
    let total = rt.block_on(ds.count_rows(None)).unwrap() as i64;

    c.bench_function(name, |b| {
        b.iter(|| {
            rt.block_on(async {
                // Restore to the base version so the bench measures a single
                // merge_insert against the same dataset shape every time.
                let bench_ds = Dataset::open(&path).await.unwrap();
                let mut bench_ds = bench_ds.checkout_version(base_version).await.unwrap();
                bench_ds.restore().await.unwrap();
                let arc = Arc::new(bench_ds);
                // base_existing in the indexed range, base_new beyond all data so it's an insert.
                one_merge_insert(arc, 100, total + 1_000_000).await;
            })
        })
    });
}

fn bench_merge_insert(c: &mut Criterion) {
    bench_one_shape(c, "merge_insert/clean", build_clean);
    bench_one_shape(
        c,
        "merge_insert/with_new_rows_only",
        build_with_new_rows_only,
    );
    bench_one_shape(
        c,
        "merge_insert/with_deletions_only",
        build_with_deletions_only,
    );
    // The shape that exercises the AllowList(Full) - BlockList(Partial) path.
    bench_one_shape(
        c,
        "merge_insert/with_new_rows_and_deletions",
        build_with_new_rows_and_deletions,
    );
}

#[cfg(target_os = "linux")]
criterion_group!(
    name=benches;
    config = Criterion::default().significance_level(0.1).sample_size(10)
        .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
    targets = bench_merge_insert);

#[cfg(not(target_os = "linux"))]
criterion_group!(
    name=benches;
    config = Criterion::default().significance_level(0.1).sample_size(10);
    targets = bench_merge_insert);

criterion_main!(benches);

rust/lance/src/index/prefilter.rs

Lines changed: 28 additions & 12 deletions
@@ -241,12 +241,20 @@ impl DatasetPreFilter {
         // outside the index bitmap. This can happen when a fragment's data was
         // modified but the index was not rewritten (e.g. after DataReplacement
         // or partial merge_insert).
-        let needs_allow_list = if restrict_to_fragments {
+        //
+        // We materialize the set of non-index fragments here so the slow path
+        // below can fold them into the deletion mask as Full blocks instead of
+        // computing AllowList(Full) - BlockList(Partial), which forces
+        // RoaringBitmap::full() per fragment in RowAddrTreeMap::sub_assign and
+        // is the dominant cost on every merge_insert call.
+        let non_index_frags: Option<RoaringBitmap> = if restrict_to_fragments {
             let dataset_frag_ids: RoaringBitmap = frag_map.keys().copied().collect();
-            !dataset_frag_ids.is_subset(&fragments)
+            let outside = &dataset_frag_ids - &fragments;
+            (!outside.is_empty()).then_some(outside)
         } else {
-            false
+            None
         };
+        let needs_allow_list = non_index_frags.is_some();
         for frag_id in fragments.iter() {
             let frag = frag_map.get(&frag_id);
             if let Some(frag) = frag {
@@ -270,21 +278,29 @@ impl DatasetPreFilter {
             }
             Some(async move { Ok(Arc::new(RowAddrMask::from_allowed(allow_list))) }.boxed())
         } else {
-            // There are deletions/missing frags. Build the deletion mask and
-            // optionally combine it with the fragment allow-list.
+            // There are deletions/missing frags. Build the deletion mask and,
+            // if needed, fold the non-index fragments into it as Full blocks.
+            // Equivalent to BlockList(deletions) | BlockList(non_index) — but
+            // expressed via insert_fragment so we never materialize a
+            // RoaringBitmap::full() per fragment.
             let fut =
                 Self::do_create_deletion_mask(dataset, missing_frags, frags_with_deletion_files);
-            if needs_allow_list {
+            if let Some(non_index_frags) = non_index_frags {
                 Some(
                     async move {
                         let deletion_mask = fut.await?;
-                        let mut allow_list = RowAddrTreeMap::new();
-                        for frag_id in fragments.iter() {
-                            allow_list.insert_fragment(frag_id);
+                        let mut combined = match &*deletion_mask {
+                            RowAddrMask::BlockList(b) => b.clone(),
+                            RowAddrMask::AllowList(_) => {
+                                // do_create_deletion_mask only returns BlockList; this is
+                                // defensive and should be unreachable.
+                                return Ok(deletion_mask);
+                            }
+                        };
+                        for frag_id in non_index_frags.iter() {
+                            combined.insert_fragment(frag_id);
                         }
-                        Ok(Arc::new(
-                            (*deletion_mask).clone() & RowAddrMask::from_allowed(allow_list),
-                        ))
+                        Ok(Arc::new(RowAddrMask::from_block(combined)))
                     }
                     .boxed(),
                 )
