Skip to content

Commit 3ead91e

Browse files
committed
chore: Improve chunking when reading files for grep
1 parent 7de55be commit 3ead91e

2 files changed

Lines changed: 373 additions & 0 deletions

File tree

crates/fff-nvim/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ tempfile = "3.8"
9393
name = "indexing_and_search"
9494
harness = false
9595

96+
[[bench]]
97+
name = "grep_bench"
98+
harness = false
99+
96100
[[bench]]
97101
name = "query_tracker_bench"
98102
harness = false
Lines changed: 369 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,369 @@
1+
use criterion::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main};
2+
use fff::types::{ContentCacheBudget, FileItem};
3+
use fff::{BigramFilter, GrepMode, GrepSearchOptions, build_bigram_index, grep};
4+
use std::io::Read;
5+
use std::path::Path;
6+
use std::sync::OnceLock;
7+
use std::time::Duration;
8+
9+
struct TestData {
10+
files: Vec<FileItem>,
11+
bigram: BigramFilter,
12+
budget: ContentCacheBudget,
13+
}
14+
15+
/// Lazily-initialized benchmark fixture; filled on the first call to `setup()`.
static SETUP: OnceLock<TestData> = OnceLock::new();
16+
17+
/// Resolves the directory of the repository used as benchmark input.
///
/// The `BIG_REPO_PATH` environment variable wins when it is set (it is returned as-is,
/// without checking that it exists). Otherwise the first existing directory among
/// `./big-repo` and `../../big-repo` is returned.
///
/// # Panics
///
/// Panics with setup instructions when the variable is unset and neither candidate exists.
fn big_repo_path() -> std::path::PathBuf {
    use std::path::PathBuf;

    match std::env::var_os("BIG_REPO_PATH") {
        Some(overridden) => PathBuf::from(overridden),
        None => ["./big-repo", "../../big-repo"]
            .iter()
            .map(|relative| PathBuf::from(*relative))
            .find(|candidate| candidate.exists())
            .unwrap_or_else(|| {
                panic!(
                    "./big-repo not found. Run from workspace root:\n \
                     git clone --depth 1 https://github.com/torvalds/linux.git big-repo"
                )
            }),
    }
}
36+
37+
fn setup() -> &'static TestData {
38+
SETUP.get_or_init(|| {
39+
let repo = big_repo_path();
40+
let canonical = fff::path_utils::canonicalize(&repo).expect("canonicalize");
41+
42+
eprintln!("Loading files from {:?}...", canonical);
43+
let mut files = load_files(&canonical);
44+
let budget = ContentCacheBudget::new_for_repo(files.len());
45+
46+
// Warm the content cache so warm benchmarks hit OnceLock.
47+
// Use unlimited budget for warmup — we want ALL files cached.
48+
// The repo budget (5k cap for 93k files) would leave most uncached.
49+
eprintln!("Warming content cache for {} files...", files.len());
50+
{
51+
let warmup_budget = ContentCacheBudget::unlimited();
52+
let mut buf = Vec::with_capacity(64 * 1024);
53+
for f in files.iter() {
54+
let _ = f.get_content_for_search(&mut buf, &warmup_budget);
55+
}
56+
}
57+
58+
eprintln!("Building bigram index...");
59+
let (bigram, binary_indices) = build_bigram_index(&files, &budget);
60+
for &i in &binary_indices {
61+
files[i].set_binary(true);
62+
}
63+
64+
let non_binary = files.iter().filter(|f| !f.is_binary()).count();
65+
eprintln!(
66+
"Ready: {} files ({} non-binary), bigram {:.1} MB",
67+
files.len(),
68+
non_binary,
69+
bigram.heap_bytes() as f64 / (1024.0 * 1024.0),
70+
);
71+
72+
TestData {
73+
files,
74+
bigram,
75+
budget,
76+
}
77+
})
78+
}
79+
80+
fn load_files(base_path: &Path) -> Vec<FileItem> {
81+
use ignore::WalkBuilder;
82+
83+
let mut files = Vec::new();
84+
WalkBuilder::new(base_path)
85+
.hidden(false)
86+
.git_ignore(true)
87+
.git_exclude(true)
88+
.git_global(true)
89+
.ignore(true)
90+
.follow_links(false)
91+
.build()
92+
.filter_map(|e| e.ok())
93+
.filter(|e| e.file_type().is_some_and(|ft| ft.is_file()))
94+
.for_each(|entry| {
95+
let path = entry.path().to_path_buf();
96+
let relative = pathdiff::diff_paths(&path, base_path).unwrap_or_else(|| path.clone());
97+
let relative_path = relative.to_string_lossy().into_owned();
98+
let size = entry.metadata().ok().map_or(0, |m| m.len());
99+
let is_binary = detect_binary(&path, size);
100+
101+
let path_string = path.to_string_lossy().into_owned();
102+
let relative_start = (path_string.len() - relative_path.len()) as u16;
103+
let filename_start = path_string
104+
.rfind('/')
105+
.map(|i| i + 1)
106+
.unwrap_or(relative_start as usize) as u16;
107+
files.push(FileItem::new_raw(
108+
path_string,
109+
relative_start,
110+
filename_start,
111+
size,
112+
0,
113+
None,
114+
is_binary,
115+
));
116+
});
117+
118+
files
119+
}
120+
121+
/// Heuristically classifies a file as binary: `true` when a NUL byte occurs within its
/// first 512 bytes.
///
/// `size` is the caller-supplied file length; zero short-circuits to `false` without
/// touching the file system. A file that cannot be opened is reported as non-binary.
///
/// The sniff buffer is filled with repeated reads because a single `Read::read` call is
/// allowed to return fewer bytes than requested, and may fail with `Interrupted`; either
/// case would otherwise cause a NUL byte inside the window to be missed.
fn detect_binary(path: &Path, size: u64) -> bool {
    const SNIFF_LEN: usize = 512;

    if size == 0 {
        return false;
    }
    let Ok(mut file) = std::fs::File::open(path) else {
        return false;
    };

    let mut head = [0u8; SNIFF_LEN];
    let mut filled = 0;
    while filled < SNIFF_LEN {
        match file.read(&mut head[filled..]) {
            // End of file: the file is shorter than the sniff window.
            Ok(0) => break,
            Ok(read) => filled += read,
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            // Any other error: judge by whatever was read so far.
            Err(_) => break,
        }
    }
    head[..filled].contains(&0)
}
133+
134+
fn plain_options() -> GrepSearchOptions {
135+
GrepSearchOptions {
136+
max_file_size: 10 * 1024 * 1024,
137+
max_matches_per_file: 200,
138+
smart_case: true,
139+
file_offset: 0,
140+
page_limit: 50,
141+
mode: GrepMode::PlainText,
142+
time_budget_ms: 0,
143+
before_context: 0,
144+
after_context: 0,
145+
classify_definitions: false,
146+
trim_whitespace: false,
147+
}
148+
}
149+
150+
fn fuzzy_options() -> GrepSearchOptions {
151+
GrepSearchOptions {
152+
mode: GrepMode::Fuzzy,
153+
..plain_options()
154+
}
155+
}
156+
157+
fn do_grep(
158+
files: &[FileItem],
159+
query: &str,
160+
options: &GrepSearchOptions,
161+
budget: &ContentCacheBudget,
162+
bigram: Option<&BigramFilter>,
163+
) -> usize {
164+
let parsed = grep::parse_grep_query(query);
165+
let result = grep::grep_search(
166+
black_box(files),
167+
black_box(&parsed),
168+
black_box(options),
169+
budget,
170+
bigram,
171+
None,
172+
None,
173+
);
174+
result.matches.len()
175+
}
176+
177+
fn bench_plain_warm(c: &mut Criterion) {
178+
let test_picker = setup();
179+
let opts = plain_options();
180+
181+
let queries: &[(&str, &str)] = &[
182+
("2char_if", "if"),
183+
("common_return", "return"),
184+
("func_mutex_lock", "mutex_lock"),
185+
("struct_inode_ops", "inode_operations"),
186+
("define_MODULE_LICENSE", "MODULE_LICENSE"),
187+
("rare_phylink_ethtool", "phylink_ethtool"),
188+
("include", "#include"),
189+
("comment_TODO", "TODO"),
190+
("type_struct_file", "struct file"),
191+
("error_EINVAL", "err = -EINVAL"),
192+
("long_static_int_init", "static int __init"),
193+
("very_common_int", "int"),
194+
("single_char_x", "x"),
195+
("path_printk_c", "printk *.c"),
196+
("dir_mutex_kernel", "mutex /kernel/"),
197+
];
198+
199+
let mut group = c.benchmark_group("plain_warm");
200+
group.sample_size(30);
201+
group.warm_up_time(Duration::from_secs(2));
202+
group.measurement_time(Duration::from_secs(5));
203+
204+
for (name, query) in queries {
205+
group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
206+
b.iter(|| do_grep(&test_picker.files, q, &opts, &test_picker.budget, None))
207+
});
208+
}
209+
210+
group.finish();
211+
}
212+
213+
fn bench_bigram_warm(c: &mut Criterion) {
214+
let test_picker = setup();
215+
let opts = plain_options();
216+
217+
let queries: &[(&str, &str)] = &[
218+
("2char_if", "if"),
219+
("common_return", "return"),
220+
("func_mutex_lock", "mutex_lock"),
221+
("struct_inode_ops", "inode_operations"),
222+
("define_MODULE_LICENSE", "MODULE_LICENSE"),
223+
("rare_phylink_ethtool", "phylink_ethtool"),
224+
("include", "#include"),
225+
("comment_TODO", "TODO"),
226+
("type_struct_file", "struct file"),
227+
("error_EINVAL", "err = -EINVAL"),
228+
("long_static_int_init", "static int __init"),
229+
("very_common_int", "int"),
230+
("single_char_x", "x"),
231+
("path_printk_c", "printk *.c"),
232+
("dir_mutex_kernel", "mutex /kernel/"),
233+
];
234+
235+
let mut group = c.benchmark_group("bigram_warm");
236+
group.sample_size(30);
237+
group.warm_up_time(Duration::from_secs(2));
238+
group.measurement_time(Duration::from_secs(5));
239+
240+
for (name, query) in queries {
241+
group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
242+
b.iter(|| {
243+
do_grep(
244+
&test_picker.files,
245+
q,
246+
&opts,
247+
&test_picker.budget,
248+
Some(&test_picker.bigram),
249+
)
250+
})
251+
});
252+
}
253+
254+
group.finish();
255+
}
256+
257+
fn bench_fuzzy_warm(c: &mut Criterion) {
258+
let test_picker = setup();
259+
let opts = fuzzy_options();
260+
261+
let queries: &[(&str, &str)] = &[
262+
("exact_mutex_lock", "mutex_lock"),
263+
("typo_mutx_lock", "mutx_lock"),
264+
("camel_InodeOps", "InodeOps"),
265+
("abbrev_sched_rt", "sched_rt"),
266+
("short_kfr", "kfr"),
267+
("common_return", "return"),
268+
("define_MODULE_LICENSE", "MODULE_LICENSE"),
269+
("struct_file_ops", "file_operations"),
270+
("long_static_int_init", "static_int_init"),
271+
("path_printk_c", "printk *.c"),
272+
];
273+
274+
let mut group = c.benchmark_group("fuzzy_warm");
275+
group.sample_size(10);
276+
group.warm_up_time(Duration::from_secs(2));
277+
group.measurement_time(Duration::from_secs(8));
278+
279+
for (name, query) in queries {
280+
group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
281+
b.iter(|| do_grep(&test_picker.files, q, &opts, &test_picker.budget, None))
282+
});
283+
}
284+
285+
group.finish();
286+
}
287+
288+
fn bench_fuzzy_bigram_warm(c: &mut Criterion) {
289+
let test_picker = setup();
290+
let opts = fuzzy_options();
291+
292+
let queries: &[(&str, &str)] = &[
293+
("exact_mutex_lock", "mutex_lock"),
294+
("typo_mutx_lock", "mutx_lock"),
295+
("camel_InodeOps", "InodeOps"),
296+
("abbrev_sched_rt", "sched_rt"),
297+
("short_kfr", "kfr"),
298+
("common_return", "return"),
299+
("define_MODULE_LICENSE", "MODULE_LICENSE"),
300+
("struct_file_ops", "file_operations"),
301+
("long_static_int_init", "static_int_init"),
302+
("path_printk_c", "printk *.c"),
303+
];
304+
305+
let mut group = c.benchmark_group("fuzzy_bigram_warm");
306+
group.sample_size(10);
307+
group.warm_up_time(Duration::from_secs(2));
308+
group.measurement_time(Duration::from_secs(8));
309+
310+
for (name, query) in queries {
311+
group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
312+
b.iter(|| {
313+
do_grep(
314+
&test_picker.files,
315+
q,
316+
&opts,
317+
&test_picker.budget,
318+
Some(&test_picker.bigram),
319+
)
320+
})
321+
});
322+
}
323+
324+
group.finish();
325+
}
326+
327+
fn bench_plain_cold(c: &mut Criterion) {
328+
let test_picker = setup();
329+
let opts = plain_options();
330+
331+
let queries: &[(&str, &str)] = &[
332+
("2char_if", "if"),
333+
("common_return", "return"),
334+
("func_mutex_lock", "mutex_lock"),
335+
("struct_inode_ops", "inode_operations"),
336+
("define_MODULE_LICENSE", "MODULE_LICENSE"),
337+
("rare_phylink_ethtool", "phylink_ethtool"),
338+
("long_static_int_init", "static int __init"),
339+
];
340+
341+
let mut group = c.benchmark_group("plain_cold");
342+
group.sample_size(10);
343+
group.warm_up_time(Duration::from_millis(500));
344+
group.measurement_time(Duration::from_secs(10));
345+
346+
let canonical = fff::path_utils::canonicalize(&big_repo_path()).expect("canonicalize");
347+
348+
for (name, query) in queries {
349+
group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| {
350+
b.iter_with_setup(
351+
|| load_files(&canonical),
352+
|fresh_files| do_grep(&fresh_files, q, &opts, &test_picker.budget, None),
353+
);
354+
});
355+
}
356+
357+
group.finish();
358+
}
359+
360+
// Registers every benchmark function in this file under the `benches` group,
// using Criterion's default configuration.
criterion_group! {
    name = benches;
    config = Criterion::default();
    targets =
        bench_plain_warm,
        bench_bigram_warm,
        bench_fuzzy_warm,
        bench_fuzzy_bigram_warm,
        bench_plain_cold,
}
368+
369+
criterion_main!(benches);

0 commit comments

Comments
 (0)