Skip to content

Commit 16b3d76

Browse files
committed
perf: Reduce memory footprint of grep index
1 parent c594421 commit 16b3d76

9 files changed

Lines changed: 447 additions & 158 deletions

File tree

crates/fff-c/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,7 @@ pub unsafe extern "C" fn fff_wait_for_scan(
606606
Err(e) => return e,
607607
};
608608

609-
let completed = FilePicker::wait_for_scan(&picker, Duration::from_millis(timeout_ms));
609+
let completed = FilePicker::wait_for_scan(picker, Duration::from_millis(timeout_ms));
610610
FffResult::ok_int(completed as i64)
611611
}
612612

crates/fff-core/benches/bigram_bench.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,9 @@ fn bench_bigram_query(c: &mut Criterion) {
2323
for &file_count in &file_counts {
2424
let index = build_test_index(file_count);
2525
eprintln!(
26-
"Index ({} files): {} columns ({} dense, {} sparse)",
26+
"Index ({} files): {} columns",
2727
file_count,
2828
index.columns_used(),
29-
index.dense_columns(),
30-
index.sparse_columns(),
3129
);
3230

3331
let mut group = c.benchmark_group(format!("bigram_query_{file_count}"));

crates/fff-core/src/file_picker.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,7 @@ pub fn build_bigram_index(
11721172
let start = std::time::Instant::now();
11731173
info!("Building bigram index for {} files...", files.len());
11741174
let builder = BigramIndexBuilder::new(files.len());
1175+
let skip_builder = BigramIndexBuilder::new(files.len());
11751176
let max_file_size = budget.max_file_size;
11761177

11771178
// Collect indices of files that passed the extension heuristic but are
@@ -1208,21 +1209,25 @@ pub fn build_bigram_index(
12081209

12091210
let content = data.unwrap_or_else(|| owned.as_ref().unwrap());
12101211
builder.add_file_content(i, content);
1212+
skip_builder.add_file_content_skip(i, content);
12111213
});
12121214

12131215
let cols = builder.columns_used();
1214-
let index = builder.compress();
1216+
let mut index = builder.compress();
1217+
// Skip index: skip bigrams are inherently less specific than consecutive
1218+
// bigrams, so relevant columns are almost always dense. Dense-only saves
1219+
// ~20% memory vs all columns with no loss in filtering.
1220+
let skip_index = skip_builder.compress();
1221+
index.set_skip_index(skip_index);
12151222

12161223
// The builder just freed ~276 MB (for 500k files) of atomic bitsets.
12171224
// Hint the allocator to return those pages to the OS.
12181225
hint_allocator_collect();
12191226

12201227
info!(
1221-
"Bigram index built in {:.2}s — {} columns ({} sparse, {} dense) for {} files",
1228+
"Bigram index built in {:.2}s — {} dense columns for {} files",
12221229
start.elapsed().as_secs_f64(),
12231230
cols,
1224-
index.sparse_columns(),
1225-
index.dense_columns(),
12261231
files.len(),
12271232
);
12281233

crates/fff-core/src/grep.rs

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1786,7 +1786,7 @@ pub fn grep_search<'a>(
17861786
// it is important that this step is coming as early as possible
17871787
let (files_to_search, filtered_file_count) = match bigram_candidates {
17881788
Some(ref candidates) if constraints_from_query.is_empty() => {
1789-
// this call is essentially free and much more efficient than alowing a recollection
1789+
// this call is essentially free and much more efficient than allowing a recollection
17901790
let cap = BigramFilter::count_candidates(candidates);
17911791
let mut result: Vec<&FileItem> = Vec::with_capacity(cap);
17921792
for (word_idx, &word) in candidates.iter().enumerate() {
@@ -1875,20 +1875,15 @@ pub fn grep_search<'a>(
18751875
}
18761876
.build();
18771877

1878-
// Dispatch to the appropriate sink type at the boundary — zero runtime
1879-
// branching inside the per-line hot path.
1880-
//
1881-
// When not in regex mode, pass the memmem finder as a whole-file prefilter.
1882-
// A single SIMD memmem scan rejects non-matching files before the
1883-
// grep-searcher allocates state and splits lines — ~0.3us vs ~7us per file.
1884-
let prefilter = if regex.is_none() { Some(&finder) } else { None };
1878+
// The prefilter scans for the exact literal, so use it only for non-regex, case-sensitive searches.
1879+
let should_perfilter = regex.is_none() && !case_insensitive;
18851880
let mut result = perform_grep(
18861881
&files_to_search,
18871882
options,
18881883
total_files,
18891884
filtered_file_count,
18901885
budget,
1891-
prefilter,
1886+
should_perfilter.then_some(&finder),
18921887
|file_bytes: &[u8], max_matches: usize| {
18931888
let state = SinkState {
18941889
file_index: 0, // set by run_file_search

0 commit comments

Comments
 (0)