perf: Chunk-based prefiltering (+50% faster on small queries)

dmtrKovalenko · dmtrKovalenko · commit a56c32d564b3 · 2026-03-27T08:52:37.000-07:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs
@@ -939,7 +939,7 @@ fn spawn_scan_and_watcher(
                         if let Some((files, budget)) = snapshot {
                             let bigram_start = std::time::Instant::now();
                             info!("Starting bigram index build for {} files...", files.len());
-                            let index = build_bigram_index(&files, &budget);
+                            let (index, content_binary) = build_bigram_index(&files, &budget);
                             info!(
                                 "Bigram index ready in {:.2}s",
                                 bigram_start.elapsed().as_secs_f64(),
@@ -948,6 +948,14 @@ fn spawn_scan_and_watcher(
                             if let Ok(mut guard) = shared_picker.write()
                                 && let Some(ref mut picker) = *guard
                             {
+                                // Mark files detected as binary by content during
+                                // bigram build so grep never has to re-check them.
+                                for &idx in &content_binary {
+                                    if let Some(file) = picker.sync_data.get_file_mut(idx) {
+                                        file.is_binary = true;
+                                    }
+                                }
+
                                 let file_count = picker.sync_data.files().len();
                                 picker.bigram_index = Some(Arc::new(index));
                                 picker.bigram_overlay = Some(Arc::new(parking_lot::RwLock::new(
@@ -1086,29 +1094,49 @@ fn warmup_mmaps(files: &[FileItem], budget: &ContentCacheBudget) {
 /// For each non-binary, non-empty file: reads content (or uses cached mmap),
 /// populates the per-file bigram bloom filter, and adds it to the inverted index.
 /// Uses rayon for parallel processing.
-pub fn build_bigram_index(files: &[FileItem], budget: &ContentCacheBudget) -> BigramFilter {
+pub fn build_bigram_index(
+    files: &[FileItem],
+    budget: &ContentCacheBudget,
+) -> (BigramFilter, Vec<usize>) {
     let start = std::time::Instant::now();
     info!("Building bigram index for {} files...", files.len());
     let builder = BigramIndexBuilder::new(files.len());
     let max_file_size = budget.max_file_size;
 
+    // Collect indices of files that passed the extension heuristic but are
+    // actually binary (contain NUL bytes). These are marked `is_binary = true`
+    // on the real file list after the build, so grep never has to re-check.
+    let content_binary: std::sync::Mutex<Vec<usize>> = std::sync::Mutex::new(Vec::new());
+
     files.par_iter().enumerate().for_each(|(i, file)| {
         if file.is_binary || file.size == 0 || file.size > max_file_size {
             return;
         }
         // Use cached content if available (no extra memory).
         // For uncached files, use read() instead of mmap() — heap memory is
         // freed immediately on drop, while mmap pages linger in RSS on macOS.
+        let data: Option<&[u8]>;
+        let owned;
         if let Some(cached) = file.get_content(budget) {
-            // Catch binary files not detected by extension heuristic
-            if !detect_binary_content(cached) {
-                builder.add_file_content(i, cached);
+            if detect_binary_content(cached) {
+                content_binary.lock().unwrap().push(i);
+                return;
             }
-        } else if let Ok(data) = std::fs::read(&file.path)
-            && !detect_binary_content(&data)
-        {
-            builder.add_file_content(i, &data);
+            data = Some(cached);
+            owned = None;
+        } else if let Ok(read_data) = std::fs::read(&file.path) {
+            if detect_binary_content(&read_data) {
+                content_binary.lock().unwrap().push(i);
+                return;
+            }
+            data = None;
+            owned = Some(read_data);
+        } else {
+            return;
         }
+
+        let content = data.unwrap_or_else(|| owned.as_ref().unwrap());
+        builder.add_file_content(i, content);
     });
 
     let cols = builder.columns_used();
@@ -1126,7 +1154,16 @@ pub fn build_bigram_index(files: &[FileItem], budget: &ContentCacheBudget) -> Bi
         index.dense_columns(),
         files.len(),
     );
-    index
+
+    let binary_indices = content_binary.into_inner().unwrap();
+    if !binary_indices.is_empty() {
+        info!(
+            "Bigram build detected {} content-binary files (not caught by extension)",
+            binary_indices.len(),
+        );
+    }
+
+    (index, binary_indices)
 }
 
 /// Result of the fast walk phase — files are searchable immediately,
diff --git a/crates/fff-core/src/grep.rs b/crates/fff-core/src/grep.rs
@@ -285,6 +285,8 @@ pub struct GrepResult<'a> {
     pub total_files: usize,
     /// Total number of searchable files (after filtering out binary, too-large, etc.).
     pub filtered_file_count: usize,
+    /// Number of files that contained at least one match.
+    pub files_with_matches: usize,
     /// The file offset to pass for the next page. `0` if there are no more files.
     /// Callers should store this and pass it as `file_offset` in the next call.
     pub next_file_offset: usize,
@@ -1044,7 +1046,12 @@ fn char_indices_to_byte_offsets(line: &str, char_indices: &[usize]) -> SmallVec<
     result
 }
 
-#[tracing::instrument(skip_all, level = Level::DEBUG)]
+/// Chunk size (in files) for paginated search. Must be large enough for good
+/// thread utilization across rayon's pool (~28 threads on modern hardware)
+/// but small enough to allow early termination after a few chunks.
+const PAGINATED_CHUNK_SIZE: usize = 512;
+
+#[tracing::instrument(skip_all, level = Level::DEBUG, fields(prefiltered_count = files_to_search.len()))]
 fn run_file_search<'a, F>(
     files_to_search: &[&'a FileItem],
     options: &GrepSearchOptions,
@@ -1064,50 +1071,109 @@ where
     };
 
     let search_start = std::time::Instant::now();
+    let page_limit = options.page_limit;
     let budget_exceeded = AtomicBool::new(false);
 
-    // Parallel phase: search all files concurrently using rayon.
-    // Every file is visited (no early-exit gaps), so per_file_results is a
-    // contiguous, order-preserving subset — pagination offsets stay correct.
-    // The time budget acts as the work bound; there is no separate file cap.
-    let per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = files_to_search
-        .par_iter()
-        .enumerate()
-        .filter_map(|(idx, file)| {
-            // Time budget check (relaxed — checked once per file, not per line).
-            if let Some(budget) = time_budget
-                && search_start.elapsed() > budget
-            {
-                budget_exceeded.store(true, Ordering::Relaxed);
-                return None;
-            }
+    // For paginated searches, process files in chunks to enable early
+    // termination. Each chunk is searched in parallel with rayon; between
+    // chunks we check whether enough matches have been collected.
+    //
+    // For full searches (page_limit = MAX), one chunk = all files — same
+    // throughput as before, no overhead from the chunking loop.
+    //
+    // For common queries ("x", "if") with ~99% hit rate: the first 512-file
+    // chunk yields ~500 matches, far exceeding page_limit=50. We stop after
+    // one chunk (~1ms) instead of searching all 93K files (~175ms).
+    let chunk_size = if page_limit < usize::MAX {
+        PAGINATED_CHUNK_SIZE
+    } else {
+        files_to_search.len().max(1)
+    };
 
-            let content = file.get_content_for_search(budget)?;
+    let mut result_files: Vec<&'a FileItem> = Vec::new();
+    let mut all_matches: Vec<GrepMatch> = Vec::new();
+    let mut files_consumed: usize = 0;
+    let mut page_filled = false;
+
+    for chunk in files_to_search.chunks(chunk_size) {
+        let chunk_offset = files_consumed;
+
+        // Parallel phase: search all files in this chunk concurrently.
+        // Within a chunk every file is visited (no gaps), so pagination
+        // offsets remain correct across chunk boundaries.
+        let chunk_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = chunk
+            .par_iter()
+            .enumerate()
+            .filter_map(|(local_idx, file)| {
+                if let Some(budget) = time_budget
+                    && search_start.elapsed() > budget
+                {
+                    budget_exceeded.store(true, Ordering::Relaxed);
+                    return None;
+                }
 
-            // Skip files that are binary but weren't caught by extension heuristic
-            if crate::file_picker::detect_binary_content(&content) {
-                return None;
-            }
+                let content = file.get_content_for_search(budget)?;
+                let file_matches = search_file(&content, options.max_matches_per_file);
+
+                if file_matches.is_empty() {
+                    return None;
+                }
+
+                Some((chunk_offset + local_idx, *file, file_matches))
+            })
+            .collect();
+
+        // Count the whole chunk as consumed, including any files skipped on budget expiry.
+        files_consumed = chunk_offset + chunk.len();
 
-            let file_matches = search_file(&content, options.max_matches_per_file);
+        // Flatten this chunk's results into the accumulator.
+        for (batch_idx, file, file_matches) in chunk_results {
+            let file_result_idx = result_files.len();
+            result_files.push(file);
 
-            if file_matches.is_empty() {
-                return None;
+            for mut m in file_matches {
+                m.file_index = file_result_idx;
+                all_matches.push(m);
             }
 
-            Some((idx, *file, file_matches))
-        })
-        .collect();
+            if all_matches.len() >= page_limit {
+                // Tighten files_consumed to the file that tipped us over so
+                // the next page resumes right after it.
+                files_consumed = batch_idx + 1;
+                page_filled = true;
+                break;
+            }
+        }
 
-    collect_grep_results(
-        per_file_results,
-        files_to_search.len(),
-        options,
+        if page_filled || budget_exceeded.load(Ordering::Relaxed) {
+            break;
+        }
+    }
+
+    // If no file had any match, we searched the entire slice.
+    if result_files.is_empty() {
+        files_consumed = files_to_search.len();
+    }
+
+    let has_more = budget_exceeded.load(Ordering::Relaxed)
+        || (page_filled && files_consumed < files_to_search.len());
+
+    let next_file_offset = if has_more {
+        options.file_offset + files_consumed
+    } else {
+        0
+    };
+
+    GrepResult {
+        matches: all_matches,
+        files_with_matches: result_files.len(),
+        files: result_files,
+        total_files_searched: files_consumed,
         total_files,
         filtered_file_count,
+        next_file_offset,
         regex_fallback_error,
-        budget_exceeded.load(Ordering::Relaxed),
-    )
+    }
 }
 
 /// Flatten per-file results into the final `GrepResult`.
@@ -1175,6 +1241,7 @@ fn collect_grep_results<'a>(
 
     GrepResult {
         matches: all_matches,
+        files_with_matches: result_files.len(),
         files: result_files,
         total_files_searched: files_consumed,
         total_files,
@@ -1243,7 +1310,7 @@ fn prepare_files_to_search<'a>(
 
 /// Fuzzy grep search using SIMD-accelerated `neo_frizbee::match_list`.
 ///
-/// # Why this doesn't use `grep-searcher` / `GrepSink`
+/// Why this doesn't use `grep-searcher` / `GrepSink`
 ///
 /// PlainText and Regex modes use the `grep-searcher` pipeline: a `Matcher`
 /// finds candidate lines, and a `Sink` collects them one at a time. This
@@ -1294,9 +1361,7 @@ fn fuzzy_grep_search<'a>(
         // Use default gap penalties. Higher values (e.g. 20) cause
         // smith-waterman to prefer *dropping needle chars* over paying
         // gap costs, which inflates the typo count and breaks
-        // transposition matching ("shcema" → "schema" becomes 3 typos
-        // instead of 1). Scattered matches are filtered by max_typos
-        // and the match span check below instead.
+        // transposition matching ("shcema" → "schema" becomes 3 typos instead of 1).
         exact_match_bonus: 100,
         // gap_open_penalty: 4,
         // gap_extend_penalty: 2,
@@ -1321,9 +1386,6 @@ fn fuzzy_grep_search<'a>(
     let perfect_score = (grep_text.len() as u16) * 16;
     let min_score = (perfect_score * 50) / 100;
 
-    // Maximum allowed span of matched characters in the haystack, relative
-    // to needle length.
-    //
     // We allow up to needle_len * 2 to accommodate fuzzy subsequence
     // matches in longer identifiers (e.g. "SortedMap" → "SortedArrayMap"
     // has span 13 for needle 9). Quality is enforced by the density and
@@ -1349,6 +1411,7 @@ fn fuzzy_grep_search<'a>(
             unique_needle_chars.push(hi);
         }
     }
+
     // How many distinct needle chars must appear in the file.
     // With max_typos allowed, we need at least (unique_count - max_typos).
     let unique_count = {
@@ -1360,12 +1423,12 @@ fn fuzzy_grep_search<'a>(
     };
     let min_chars_required = unique_count.saturating_sub(max_typos);
 
-    let _time_budget = if options.time_budget_ms > 0 {
+    let time_budget = if options.time_budget_ms > 0 {
         Some(std::time::Duration::from_millis(options.time_budget_ms))
     } else {
         None
     };
-    let _search_start = std::time::Instant::now();
+    let search_start = std::time::Instant::now();
     let budget_exceeded = AtomicBool::new(false);
     let max_matches_per_file = options.max_matches_per_file;
 
@@ -1378,17 +1441,14 @@ fn fuzzy_grep_search<'a>(
         .map_init(
             || matcher.clone(),
             |matcher, (idx, file)| {
-                // if let Some(budget) = time_budget
-                //     && search_start.elapsed() > budget
-                // {
-                //     budget_exceeded.store(true, Ordering::Relaxed);
-                //     return None;
-                // }
-
-                let file_content = file.get_content_for_search(budget)?;
-                if crate::file_picker::detect_binary_content(&file_content) {
+                if let Some(budget) = time_budget
+                    && search_start.elapsed() > budget
+                {
+                    budget_exceeded.store(true, Ordering::Relaxed);
                     return None;
                 }
+
+                let file_content = file.get_content_for_search(budget)?;
                 let file_bytes: &[u8] = &file_content;
 
                 // File-level prefilter: check if enough distinct needle chars
@@ -1579,6 +1639,7 @@ fn fuzzy_grep_search<'a>(
 ///
 /// When `query` is empty, returns git-modified/untracked files sorted by
 /// frecency for the "welcome state" UI.
+#[tracing::instrument(skip(files, options, budget, bigram_index, bigram_overlay), fields(file_count = files.len()))]
 pub fn grep_search<'a>(
     files: &'a [FileItem],
     query: &FFFQuery<'_>,
diff --git a/crates/fff-core/src/log.rs b/crates/fff-core/src/log.rs
@@ -137,8 +137,8 @@ pub fn init_tracing(log_file_path: &str, log_level: Option<&str>) -> Result<Stri
                     .with_target(true)
                     .with_thread_ids(false)
                     .with_thread_names(false)
-                    .with_file(true)
-                    .with_line_number(true)
+                    // .with_file(true)
+                    // .with_line_number(true)
                     .with_ansi(false)
                     .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE),
             )
diff --git a/crates/fff-core/src/query_tracker.rs b/crates/fff-core/src/query_tracker.rs
@@ -246,7 +246,6 @@ impl QueryTracker {
         min_combo_count: u32,
     ) -> Result<Option<QueryMatchEntry>, Error> {
         let query_key = Self::create_query_key(project_path, query)?;
-        tracing::debug!(?query_key, "HASH");
         let rtxn = self.env.read_txn().map_err(Error::DbStartReadTxn)?;
 
         let last_match = self
diff --git a/crates/fff-query-parser/Cargo.toml b/crates/fff-query-parser/Cargo.toml
@@ -17,3 +17,8 @@ zlob = ["dep:zlob"]
 zlob = { workspace = true, optional = true }
 
 [dev-dependencies]
+criterion = { version = "0.5", features = ["html_reports"] }
+
+[[bench]]
+name = "parse_bench"
+harness = false
diff --git a/crates/fff-query-parser/benches/parse_bench.rs b/crates/fff-query-parser/benches/parse_bench.rs