Skip to content

Commit a56c32d

Browse files
committed
perf: Chunk-based prefiltering (+50% faster on small queries)
1 parent eb577ea commit a56c32d

7 files changed

Lines changed: 347 additions & 64 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/fff-core/src/file_picker.rs

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,7 @@ fn spawn_scan_and_watcher(
939939
if let Some((files, budget)) = snapshot {
940940
let bigram_start = std::time::Instant::now();
941941
info!("Starting bigram index build for {} files...", files.len());
942-
let index = build_bigram_index(&files, &budget);
942+
let (index, content_binary) = build_bigram_index(&files, &budget);
943943
info!(
944944
"Bigram index ready in {:.2}s",
945945
bigram_start.elapsed().as_secs_f64(),
@@ -948,6 +948,14 @@ fn spawn_scan_and_watcher(
948948
if let Ok(mut guard) = shared_picker.write()
949949
&& let Some(ref mut picker) = *guard
950950
{
951+
// Mark files detected as binary by content during
952+
// bigram build so grep never has to re-check them.
953+
for &idx in &content_binary {
954+
if let Some(file) = picker.sync_data.get_file_mut(idx) {
955+
file.is_binary = true;
956+
}
957+
}
958+
951959
let file_count = picker.sync_data.files().len();
952960
picker.bigram_index = Some(Arc::new(index));
953961
picker.bigram_overlay = Some(Arc::new(parking_lot::RwLock::new(
@@ -1086,29 +1094,49 @@ fn warmup_mmaps(files: &[FileItem], budget: &ContentCacheBudget) {
10861094
/// For each non-binary, non-empty file: reads content (or uses cached mmap),
10871095
/// populates the per-file bigram bloom filter, and adds it to the inverted index.
10881096
/// Uses rayon for parallel processing.
1089-
pub fn build_bigram_index(files: &[FileItem], budget: &ContentCacheBudget) -> BigramFilter {
1097+
pub fn build_bigram_index(
1098+
files: &[FileItem],
1099+
budget: &ContentCacheBudget,
1100+
) -> (BigramFilter, Vec<usize>) {
10901101
let start = std::time::Instant::now();
10911102
info!("Building bigram index for {} files...", files.len());
10921103
let builder = BigramIndexBuilder::new(files.len());
10931104
let max_file_size = budget.max_file_size;
10941105

1106+
// Collect indices of files that passed the extension heuristic but are
1107+
// actually binary (contain NUL bytes). These are marked `is_binary = true`
1108+
// on the real file list after the build, so grep never has to re-check.
1109+
let content_binary: std::sync::Mutex<Vec<usize>> = std::sync::Mutex::new(Vec::new());
1110+
10951111
files.par_iter().enumerate().for_each(|(i, file)| {
10961112
if file.is_binary || file.size == 0 || file.size > max_file_size {
10971113
return;
10981114
}
10991115
// Use cached content if available (no extra memory).
11001116
// For uncached files, use read() instead of mmap() — heap memory is
11011117
// freed immediately on drop, while mmap pages linger in RSS on macOS.
1118+
let data: Option<&[u8]>;
1119+
let owned;
11021120
if let Some(cached) = file.get_content(budget) {
1103-
// Catch binary files not detected by extension heuristic
1104-
if !detect_binary_content(cached) {
1105-
builder.add_file_content(i, cached);
1121+
if detect_binary_content(cached) {
1122+
content_binary.lock().unwrap().push(i);
1123+
return;
11061124
}
1107-
} else if let Ok(data) = std::fs::read(&file.path)
1108-
&& !detect_binary_content(&data)
1109-
{
1110-
builder.add_file_content(i, &data);
1125+
data = Some(cached);
1126+
owned = None;
1127+
} else if let Ok(read_data) = std::fs::read(&file.path) {
1128+
if detect_binary_content(&read_data) {
1129+
content_binary.lock().unwrap().push(i);
1130+
return;
1131+
}
1132+
data = None;
1133+
owned = Some(read_data);
1134+
} else {
1135+
return;
11111136
}
1137+
1138+
let content = data.unwrap_or_else(|| owned.as_ref().unwrap());
1139+
builder.add_file_content(i, content);
11121140
});
11131141

11141142
let cols = builder.columns_used();
@@ -1126,7 +1154,16 @@ pub fn build_bigram_index(files: &[FileItem], budget: &ContentCacheBudget) -> Bi
11261154
index.dense_columns(),
11271155
files.len(),
11281156
);
1129-
index
1157+
1158+
let binary_indices = content_binary.into_inner().unwrap();
1159+
if !binary_indices.is_empty() {
1160+
info!(
1161+
"Bigram build detected {} content-binary files (not caught by extension)",
1162+
binary_indices.len(),
1163+
);
1164+
}
1165+
1166+
(index, binary_indices)
11301167
}
11311168

11321169
/// Result of the fast walk phase — files are searchable immediately,

crates/fff-core/src/grep.rs

Lines changed: 112 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,8 @@ pub struct GrepResult<'a> {
285285
pub total_files: usize,
286286
/// Total number of searchable files (after filtering out binary, too-large, etc.).
287287
pub filtered_file_count: usize,
288+
/// Number of files that contained at least one match.
289+
pub files_with_matches: usize,
288290
/// The file offset to pass for the next page. `0` if there are no more files.
289291
/// Callers should store this and pass it as `file_offset` in the next call.
290292
pub next_file_offset: usize,
@@ -1044,7 +1046,12 @@ fn char_indices_to_byte_offsets(line: &str, char_indices: &[usize]) -> SmallVec<
10441046
result
10451047
}
10461048

1047-
#[tracing::instrument(skip_all, level = Level::DEBUG)]
1049+
/// Chunk size for paginated search. Must be large enough for good
1050+
/// thread utilization across rayon's pool (~28 threads on modern hardware)
1051+
/// but small enough to allow early termination after few chunks.
1052+
const PAGINATED_CHUNK_SIZE: usize = 512;
1053+
1054+
#[tracing::instrument(skip_all, level = Level::DEBUG, fields(prefiltered_count = files_to_search.len()))]
10481055
fn run_file_search<'a, F>(
10491056
files_to_search: &[&'a FileItem],
10501057
options: &GrepSearchOptions,
@@ -1064,50 +1071,109 @@ where
10641071
};
10651072

10661073
let search_start = std::time::Instant::now();
1074+
let page_limit = options.page_limit;
10671075
let budget_exceeded = AtomicBool::new(false);
10681076

1069-
// Parallel phase: search all files concurrently using rayon.
1070-
// Every file is visited (no early-exit gaps), so per_file_results is a
1071-
// contiguous, order-preserving subset — pagination offsets stay correct.
1072-
// The time budget acts as the work bound; there is no separate file cap.
1073-
let per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = files_to_search
1074-
.par_iter()
1075-
.enumerate()
1076-
.filter_map(|(idx, file)| {
1077-
// Time budget check (relaxed — checked once per file, not per line).
1078-
if let Some(budget) = time_budget
1079-
&& search_start.elapsed() > budget
1080-
{
1081-
budget_exceeded.store(true, Ordering::Relaxed);
1082-
return None;
1083-
}
1077+
// For paginated searches, process files in chunks to enable early
1078+
// termination. Each chunk is searched in parallel with rayon; between
1079+
// chunks we check whether enough matches have been collected.
1080+
//
1081+
// For full searches (page_limit = MAX), one chunk = all files — same
1082+
// throughput as before, no overhead from the chunking loop.
1083+
//
1084+
// For common queries ("x", "if") with ~99% hit rate: the first 512-file
1085+
// chunk yields ~500 matches, far exceeding page_limit=50. We stop after
1086+
// one chunk (~1ms) instead of searching all 93K files (~175ms).
1087+
let chunk_size = if page_limit < usize::MAX {
1088+
PAGINATED_CHUNK_SIZE
1089+
} else {
1090+
files_to_search.len().max(1)
1091+
};
10841092

1085-
let content = file.get_content_for_search(budget)?;
1093+
let mut result_files: Vec<&'a FileItem> = Vec::new();
1094+
let mut all_matches: Vec<GrepMatch> = Vec::new();
1095+
let mut files_consumed: usize = 0;
1096+
let mut page_filled = false;
1097+
1098+
for chunk in files_to_search.chunks(chunk_size) {
1099+
let chunk_offset = files_consumed;
1100+
1101+
// Parallel phase: search all files in this chunk concurrently.
1102+
// Within a chunk every file is visited (no gaps), so pagination
1103+
// offsets remain correct across chunk boundaries.
1104+
let chunk_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = chunk
1105+
.par_iter()
1106+
.enumerate()
1107+
.filter_map(|(local_idx, file)| {
1108+
if let Some(budget) = time_budget
1109+
&& search_start.elapsed() > budget
1110+
{
1111+
budget_exceeded.store(true, Ordering::Relaxed);
1112+
return None;
1113+
}
10861114

1087-
// Skip files that are binary but weren't caught by extension heuristic
1088-
if crate::file_picker::detect_binary_content(&content) {
1089-
return None;
1090-
}
1115+
let content = file.get_content_for_search(budget)?;
1116+
let file_matches = search_file(&content, options.max_matches_per_file);
1117+
1118+
if file_matches.is_empty() {
1119+
return None;
1120+
}
1121+
1122+
Some((chunk_offset + local_idx, *file, file_matches))
1123+
})
1124+
.collect();
1125+
1126+
// Every file in the chunk was visited by rayon (matched or not).
1127+
files_consumed = chunk_offset + chunk.len();
10911128

1092-
let file_matches = search_file(&content, options.max_matches_per_file);
1129+
// Flatten this chunk's results into the accumulator.
1130+
for (batch_idx, file, file_matches) in chunk_results {
1131+
let file_result_idx = result_files.len();
1132+
result_files.push(file);
10931133

1094-
if file_matches.is_empty() {
1095-
return None;
1134+
for mut m in file_matches {
1135+
m.file_index = file_result_idx;
1136+
all_matches.push(m);
10961137
}
10971138

1098-
Some((idx, *file, file_matches))
1099-
})
1100-
.collect();
1139+
if all_matches.len() >= page_limit {
1140+
// Tighten files_consumed to the file that tipped us over so
1141+
// the next page resumes right after it.
1142+
files_consumed = batch_idx + 1;
1143+
page_filled = true;
1144+
break;
1145+
}
1146+
}
11011147

1102-
collect_grep_results(
1103-
per_file_results,
1104-
files_to_search.len(),
1105-
options,
1148+
if page_filled || budget_exceeded.load(Ordering::Relaxed) {
1149+
break;
1150+
}
1151+
}
1152+
1153+
// If no file had any match, we searched the entire slice.
1154+
if result_files.is_empty() {
1155+
files_consumed = files_to_search.len();
1156+
}
1157+
1158+
let has_more = budget_exceeded.load(Ordering::Relaxed)
1159+
|| (page_filled && files_consumed < files_to_search.len());
1160+
1161+
let next_file_offset = if has_more {
1162+
options.file_offset + files_consumed
1163+
} else {
1164+
0
1165+
};
1166+
1167+
GrepResult {
1168+
matches: all_matches,
1169+
files_with_matches: result_files.len(),
1170+
files: result_files,
1171+
total_files_searched: files_consumed,
11061172
total_files,
11071173
filtered_file_count,
1174+
next_file_offset,
11081175
regex_fallback_error,
1109-
budget_exceeded.load(Ordering::Relaxed),
1110-
)
1176+
}
11111177
}
11121178

11131179
/// Flatten per-file results into the final `GrepResult`.
@@ -1175,6 +1241,7 @@ fn collect_grep_results<'a>(
11751241

11761242
GrepResult {
11771243
matches: all_matches,
1244+
files_with_matches: result_files.len(),
11781245
files: result_files,
11791246
total_files_searched: files_consumed,
11801247
total_files,
@@ -1243,7 +1310,7 @@ fn prepare_files_to_search<'a>(
12431310

12441311
/// Fuzzy grep search using SIMD-accelerated `neo_frizbee::match_list`.
12451312
///
1246-
/// # Why this doesn't use `grep-searcher` / `GrepSink`
1313+
/// Why this doesn't use `grep-searcher` / `GrepSink`
12471314
///
12481315
/// PlainText and Regex modes use the `grep-searcher` pipeline: a `Matcher`
12491316
/// finds candidate lines, and a `Sink` collects them one at a time. This
@@ -1294,9 +1361,7 @@ fn fuzzy_grep_search<'a>(
12941361
// Use default gap penalties. Higher values (e.g. 20) cause
12951362
// smith-waterman to prefer *dropping needle chars* over paying
12961363
// gap costs, which inflates the typo count and breaks
1297-
// transposition matching ("shcema" → "schema" becomes 3 typos
1298-
// instead of 1). Scattered matches are filtered by max_typos
1299-
// and the match span check below instead.
1364+
// transposition matching ("shcema" → "schema" becomes 3 typos instead of 1).
13001365
exact_match_bonus: 100,
13011366
// gap_open_penalty: 4,
13021367
// gap_extend_penalty: 2,
@@ -1321,9 +1386,6 @@ fn fuzzy_grep_search<'a>(
13211386
let perfect_score = (grep_text.len() as u16) * 16;
13221387
let min_score = (perfect_score * 50) / 100;
13231388

1324-
// Maximum allowed span of matched characters in the haystack, relative
1325-
// to needle length.
1326-
//
13271389
// We allow a match span of up to needle_len * 2 to accommodate fuzzy subsequence
13281390
// matches in longer identifiers (e.g. "SortedMap" → "SortedArrayMap"
13291391
// has span 13 for needle 9). Quality is enforced by the density and
@@ -1349,6 +1411,7 @@ fn fuzzy_grep_search<'a>(
13491411
unique_needle_chars.push(hi);
13501412
}
13511413
}
1414+
13521415
// How many distinct needle chars must appear in the file.
13531416
// With max_typos allowed, we need at least (unique_count - max_typos).
13541417
let unique_count = {
@@ -1360,12 +1423,12 @@ fn fuzzy_grep_search<'a>(
13601423
};
13611424
let min_chars_required = unique_count.saturating_sub(max_typos);
13621425

1363-
let _time_budget = if options.time_budget_ms > 0 {
1426+
let time_budget = if options.time_budget_ms > 0 {
13641427
Some(std::time::Duration::from_millis(options.time_budget_ms))
13651428
} else {
13661429
None
13671430
};
1368-
let _search_start = std::time::Instant::now();
1431+
let search_start = std::time::Instant::now();
13691432
let budget_exceeded = AtomicBool::new(false);
13701433
let max_matches_per_file = options.max_matches_per_file;
13711434

@@ -1378,17 +1441,14 @@ fn fuzzy_grep_search<'a>(
13781441
.map_init(
13791442
|| matcher.clone(),
13801443
|matcher, (idx, file)| {
1381-
// if let Some(budget) = time_budget
1382-
// && search_start.elapsed() > budget
1383-
// {
1384-
// budget_exceeded.store(true, Ordering::Relaxed);
1385-
// return None;
1386-
// }
1387-
1388-
let file_content = file.get_content_for_search(budget)?;
1389-
if crate::file_picker::detect_binary_content(&file_content) {
1444+
if let Some(budget) = time_budget
1445+
&& search_start.elapsed() > budget
1446+
{
1447+
budget_exceeded.store(true, Ordering::Relaxed);
13901448
return None;
13911449
}
1450+
1451+
let file_content = file.get_content_for_search(budget)?;
13921452
let file_bytes: &[u8] = &file_content;
13931453

13941454
// File-level prefilter: check if enough distinct needle chars
@@ -1579,6 +1639,7 @@ fn fuzzy_grep_search<'a>(
15791639
///
15801640
/// When `query` is empty, returns git-modified/untracked files sorted by
15811641
/// frecency for the "welcome state" UI.
1642+
#[tracing::instrument(skip(files, options, budget, bigram_index, bigram_overlay), fields(file_count = files.len()))]
15821643
pub fn grep_search<'a>(
15831644
files: &'a [FileItem],
15841645
query: &FFFQuery<'_>,

crates/fff-core/src/log.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,8 @@ pub fn init_tracing(log_file_path: &str, log_level: Option<&str>) -> Result<Stri
137137
.with_target(true)
138138
.with_thread_ids(false)
139139
.with_thread_names(false)
140-
.with_file(true)
141-
.with_line_number(true)
140+
// .with_file(true)
141+
// .with_line_number(true)
142142
.with_ansi(false)
143143
.with_span_events(FmtSpan::NEW | FmtSpan::CLOSE),
144144
)

crates/fff-core/src/query_tracker.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,6 @@ impl QueryTracker {
246246
min_combo_count: u32,
247247
) -> Result<Option<QueryMatchEntry>, Error> {
248248
let query_key = Self::create_query_key(project_path, query)?;
249-
tracing::debug!(?query_key, "HASH");
250249
let rtxn = self.env.read_txn().map_err(Error::DbStartReadTxn)?;
251250

252251
let last_match = self

crates/fff-query-parser/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,8 @@ zlob = ["dep:zlob"]
1717
zlob = { workspace = true, optional = true }
1818

1919
[dev-dependencies]
20+
criterion = { version = "0.5", features = ["html_reports"] }
21+
22+
[[bench]]
23+
name = "parse_bench"
24+
harness = false

0 commit comments

Comments
 (0)