|
| 1 | +use criterion::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main}; |
| 2 | +use fff::types::{ContentCacheBudget, FileItem}; |
| 3 | +use fff::{BigramFilter, GrepMode, GrepSearchOptions, build_bigram_index, grep}; |
| 4 | +use std::io::Read; |
| 5 | +use std::path::Path; |
| 6 | +use std::sync::OnceLock; |
| 7 | +use std::time::Duration; |
| 8 | + |
| 9 | +struct TestData { |
| 10 | + files: Vec<FileItem>, |
| 11 | + bigram: BigramFilter, |
| 12 | + budget: ContentCacheBudget, |
| 13 | +} |
| 14 | + |
| 15 | +static SETUP: OnceLock<TestData> = OnceLock::new(); |
| 16 | + |
/// Resolves the directory of the large repository the benchmarks run against.
///
/// The `BIG_REPO_PATH` environment variable, when set, is returned as-is
/// (its existence is not checked). Otherwise `./big-repo` and then
/// `../../big-repo` are probed, and the first one that exists is returned.
///
/// # Panics
///
/// Panics with cloning instructions when the variable is unset and neither
/// candidate directory exists.
fn big_repo_path() -> std::path::PathBuf {
    use std::path::PathBuf;

    match std::env::var_os("BIG_REPO_PATH") {
        // An explicit override always wins.
        Some(overridden) => PathBuf::from(overridden),
        None => ["./big-repo", "../../big-repo"]
            .iter()
            .map(|candidate| PathBuf::from(*candidate))
            .find(|candidate| candidate.exists())
            .unwrap_or_else(|| {
                panic!(
                    "./big-repo not found. Run from workspace root:\n \
                     git clone --depth 1 https://github.com/torvalds/linux.git big-repo"
                )
            }),
    }
}
| 36 | + |
| 37 | +fn setup() -> &'static TestData { |
| 38 | + SETUP.get_or_init(|| { |
| 39 | + let repo = big_repo_path(); |
| 40 | + let canonical = fff::path_utils::canonicalize(&repo).expect("canonicalize"); |
| 41 | + |
| 42 | + eprintln!("Loading files from {:?}...", canonical); |
| 43 | + let mut files = load_files(&canonical); |
| 44 | + let budget = ContentCacheBudget::new_for_repo(files.len()); |
| 45 | + |
| 46 | + // Warm the content cache so warm benchmarks hit OnceLock. |
| 47 | + // Use unlimited budget for warmup — we want ALL files cached. |
| 48 | + // The repo budget (5k cap for 93k files) would leave most uncached. |
| 49 | + eprintln!("Warming content cache for {} files...", files.len()); |
| 50 | + { |
| 51 | + let warmup_budget = ContentCacheBudget::unlimited(); |
| 52 | + let mut buf = Vec::with_capacity(64 * 1024); |
| 53 | + for f in files.iter() { |
| 54 | + let _ = f.get_content_for_search(&mut buf, &warmup_budget); |
| 55 | + } |
| 56 | + } |
| 57 | + |
| 58 | + eprintln!("Building bigram index..."); |
| 59 | + let (bigram, binary_indices) = build_bigram_index(&files, &budget); |
| 60 | + for &i in &binary_indices { |
| 61 | + files[i].set_binary(true); |
| 62 | + } |
| 63 | + |
| 64 | + let non_binary = files.iter().filter(|f| !f.is_binary()).count(); |
| 65 | + eprintln!( |
| 66 | + "Ready: {} files ({} non-binary), bigram {:.1} MB", |
| 67 | + files.len(), |
| 68 | + non_binary, |
| 69 | + bigram.heap_bytes() as f64 / (1024.0 * 1024.0), |
| 70 | + ); |
| 71 | + |
| 72 | + TestData { |
| 73 | + files, |
| 74 | + bigram, |
| 75 | + budget, |
| 76 | + } |
| 77 | + }) |
| 78 | +} |
| 79 | + |
| 80 | +fn load_files(base_path: &Path) -> Vec<FileItem> { |
| 81 | + use ignore::WalkBuilder; |
| 82 | + |
| 83 | + let mut files = Vec::new(); |
| 84 | + WalkBuilder::new(base_path) |
| 85 | + .hidden(false) |
| 86 | + .git_ignore(true) |
| 87 | + .git_exclude(true) |
| 88 | + .git_global(true) |
| 89 | + .ignore(true) |
| 90 | + .follow_links(false) |
| 91 | + .build() |
| 92 | + .filter_map(|e| e.ok()) |
| 93 | + .filter(|e| e.file_type().is_some_and(|ft| ft.is_file())) |
| 94 | + .for_each(|entry| { |
| 95 | + let path = entry.path().to_path_buf(); |
| 96 | + let relative = pathdiff::diff_paths(&path, base_path).unwrap_or_else(|| path.clone()); |
| 97 | + let relative_path = relative.to_string_lossy().into_owned(); |
| 98 | + let size = entry.metadata().ok().map_or(0, |m| m.len()); |
| 99 | + let is_binary = detect_binary(&path, size); |
| 100 | + |
| 101 | + let path_string = path.to_string_lossy().into_owned(); |
| 102 | + let relative_start = (path_string.len() - relative_path.len()) as u16; |
| 103 | + let filename_start = path_string |
| 104 | + .rfind('/') |
| 105 | + .map(|i| i + 1) |
| 106 | + .unwrap_or(relative_start as usize) as u16; |
| 107 | + files.push(FileItem::new_raw( |
| 108 | + path_string, |
| 109 | + relative_start, |
| 110 | + filename_start, |
| 111 | + size, |
| 112 | + 0, |
| 113 | + None, |
| 114 | + is_binary, |
| 115 | + )); |
| 116 | + }); |
| 117 | + |
| 118 | + files |
| 119 | +} |
| 120 | + |
/// Heuristically decides whether the file at `path` is binary.
///
/// A file is reported as binary when a NUL byte appears in the bytes returned
/// by a single read of at most 512 bytes from its start. Files reported as
/// empty (`size == 0`) and files that cannot be opened or read are treated
/// as not binary.
fn detect_binary(path: &Path, size: u64) -> bool {
    if size == 0 {
        return false;
    }

    match std::fs::File::open(path) {
        Err(_) => false,
        Ok(file) => {
            let mut head = [0u8; 512];
            let mut reader = std::io::BufReader::with_capacity(1024, file);
            // A failed read is treated the same as reading nothing.
            let filled = reader.read(&mut head).unwrap_or(0);
            head[..filled].iter().any(|&byte| byte == 0)
        }
    }
}
| 133 | + |
| 134 | +fn plain_options() -> GrepSearchOptions { |
| 135 | + GrepSearchOptions { |
| 136 | + max_file_size: 10 * 1024 * 1024, |
| 137 | + max_matches_per_file: 200, |
| 138 | + smart_case: true, |
| 139 | + file_offset: 0, |
| 140 | + page_limit: 50, |
| 141 | + mode: GrepMode::PlainText, |
| 142 | + time_budget_ms: 0, |
| 143 | + before_context: 0, |
| 144 | + after_context: 0, |
| 145 | + classify_definitions: false, |
| 146 | + trim_whitespace: false, |
| 147 | + } |
| 148 | +} |
| 149 | + |
| 150 | +fn fuzzy_options() -> GrepSearchOptions { |
| 151 | + GrepSearchOptions { |
| 152 | + mode: GrepMode::Fuzzy, |
| 153 | + ..plain_options() |
| 154 | + } |
| 155 | +} |
| 156 | + |
| 157 | +fn do_grep( |
| 158 | + files: &[FileItem], |
| 159 | + query: &str, |
| 160 | + options: &GrepSearchOptions, |
| 161 | + budget: &ContentCacheBudget, |
| 162 | + bigram: Option<&BigramFilter>, |
| 163 | +) -> usize { |
| 164 | + let parsed = grep::parse_grep_query(query); |
| 165 | + let result = grep::grep_search( |
| 166 | + black_box(files), |
| 167 | + black_box(&parsed), |
| 168 | + black_box(options), |
| 169 | + budget, |
| 170 | + bigram, |
| 171 | + None, |
| 172 | + None, |
| 173 | + ); |
| 174 | + result.matches.len() |
| 175 | +} |
| 176 | + |
| 177 | +fn bench_plain_warm(c: &mut Criterion) { |
| 178 | + let test_picker = setup(); |
| 179 | + let opts = plain_options(); |
| 180 | + |
| 181 | + let queries: &[(&str, &str)] = &[ |
| 182 | + ("2char_if", "if"), |
| 183 | + ("common_return", "return"), |
| 184 | + ("func_mutex_lock", "mutex_lock"), |
| 185 | + ("struct_inode_ops", "inode_operations"), |
| 186 | + ("define_MODULE_LICENSE", "MODULE_LICENSE"), |
| 187 | + ("rare_phylink_ethtool", "phylink_ethtool"), |
| 188 | + ("include", "#include"), |
| 189 | + ("comment_TODO", "TODO"), |
| 190 | + ("type_struct_file", "struct file"), |
| 191 | + ("error_EINVAL", "err = -EINVAL"), |
| 192 | + ("long_static_int_init", "static int __init"), |
| 193 | + ("very_common_int", "int"), |
| 194 | + ("single_char_x", "x"), |
| 195 | + ("path_printk_c", "printk *.c"), |
| 196 | + ("dir_mutex_kernel", "mutex /kernel/"), |
| 197 | + ]; |
| 198 | + |
| 199 | + let mut group = c.benchmark_group("plain_warm"); |
| 200 | + group.sample_size(30); |
| 201 | + group.warm_up_time(Duration::from_secs(2)); |
| 202 | + group.measurement_time(Duration::from_secs(5)); |
| 203 | + |
| 204 | + for (name, query) in queries { |
| 205 | + group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { |
| 206 | + b.iter(|| do_grep(&test_picker.files, q, &opts, &test_picker.budget, None)) |
| 207 | + }); |
| 208 | + } |
| 209 | + |
| 210 | + group.finish(); |
| 211 | +} |
| 212 | + |
| 213 | +fn bench_bigram_warm(c: &mut Criterion) { |
| 214 | + let test_picker = setup(); |
| 215 | + let opts = plain_options(); |
| 216 | + |
| 217 | + let queries: &[(&str, &str)] = &[ |
| 218 | + ("2char_if", "if"), |
| 219 | + ("common_return", "return"), |
| 220 | + ("func_mutex_lock", "mutex_lock"), |
| 221 | + ("struct_inode_ops", "inode_operations"), |
| 222 | + ("define_MODULE_LICENSE", "MODULE_LICENSE"), |
| 223 | + ("rare_phylink_ethtool", "phylink_ethtool"), |
| 224 | + ("include", "#include"), |
| 225 | + ("comment_TODO", "TODO"), |
| 226 | + ("type_struct_file", "struct file"), |
| 227 | + ("error_EINVAL", "err = -EINVAL"), |
| 228 | + ("long_static_int_init", "static int __init"), |
| 229 | + ("very_common_int", "int"), |
| 230 | + ("single_char_x", "x"), |
| 231 | + ("path_printk_c", "printk *.c"), |
| 232 | + ("dir_mutex_kernel", "mutex /kernel/"), |
| 233 | + ]; |
| 234 | + |
| 235 | + let mut group = c.benchmark_group("bigram_warm"); |
| 236 | + group.sample_size(30); |
| 237 | + group.warm_up_time(Duration::from_secs(2)); |
| 238 | + group.measurement_time(Duration::from_secs(5)); |
| 239 | + |
| 240 | + for (name, query) in queries { |
| 241 | + group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { |
| 242 | + b.iter(|| { |
| 243 | + do_grep( |
| 244 | + &test_picker.files, |
| 245 | + q, |
| 246 | + &opts, |
| 247 | + &test_picker.budget, |
| 248 | + Some(&test_picker.bigram), |
| 249 | + ) |
| 250 | + }) |
| 251 | + }); |
| 252 | + } |
| 253 | + |
| 254 | + group.finish(); |
| 255 | +} |
| 256 | + |
| 257 | +fn bench_fuzzy_warm(c: &mut Criterion) { |
| 258 | + let test_picker = setup(); |
| 259 | + let opts = fuzzy_options(); |
| 260 | + |
| 261 | + let queries: &[(&str, &str)] = &[ |
| 262 | + ("exact_mutex_lock", "mutex_lock"), |
| 263 | + ("typo_mutx_lock", "mutx_lock"), |
| 264 | + ("camel_InodeOps", "InodeOps"), |
| 265 | + ("abbrev_sched_rt", "sched_rt"), |
| 266 | + ("short_kfr", "kfr"), |
| 267 | + ("common_return", "return"), |
| 268 | + ("define_MODULE_LICENSE", "MODULE_LICENSE"), |
| 269 | + ("struct_file_ops", "file_operations"), |
| 270 | + ("long_static_int_init", "static_int_init"), |
| 271 | + ("path_printk_c", "printk *.c"), |
| 272 | + ]; |
| 273 | + |
| 274 | + let mut group = c.benchmark_group("fuzzy_warm"); |
| 275 | + group.sample_size(10); |
| 276 | + group.warm_up_time(Duration::from_secs(2)); |
| 277 | + group.measurement_time(Duration::from_secs(8)); |
| 278 | + |
| 279 | + for (name, query) in queries { |
| 280 | + group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { |
| 281 | + b.iter(|| do_grep(&test_picker.files, q, &opts, &test_picker.budget, None)) |
| 282 | + }); |
| 283 | + } |
| 284 | + |
| 285 | + group.finish(); |
| 286 | +} |
| 287 | + |
| 288 | +fn bench_fuzzy_bigram_warm(c: &mut Criterion) { |
| 289 | + let test_picker = setup(); |
| 290 | + let opts = fuzzy_options(); |
| 291 | + |
| 292 | + let queries: &[(&str, &str)] = &[ |
| 293 | + ("exact_mutex_lock", "mutex_lock"), |
| 294 | + ("typo_mutx_lock", "mutx_lock"), |
| 295 | + ("camel_InodeOps", "InodeOps"), |
| 296 | + ("abbrev_sched_rt", "sched_rt"), |
| 297 | + ("short_kfr", "kfr"), |
| 298 | + ("common_return", "return"), |
| 299 | + ("define_MODULE_LICENSE", "MODULE_LICENSE"), |
| 300 | + ("struct_file_ops", "file_operations"), |
| 301 | + ("long_static_int_init", "static_int_init"), |
| 302 | + ("path_printk_c", "printk *.c"), |
| 303 | + ]; |
| 304 | + |
| 305 | + let mut group = c.benchmark_group("fuzzy_bigram_warm"); |
| 306 | + group.sample_size(10); |
| 307 | + group.warm_up_time(Duration::from_secs(2)); |
| 308 | + group.measurement_time(Duration::from_secs(8)); |
| 309 | + |
| 310 | + for (name, query) in queries { |
| 311 | + group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { |
| 312 | + b.iter(|| { |
| 313 | + do_grep( |
| 314 | + &test_picker.files, |
| 315 | + q, |
| 316 | + &opts, |
| 317 | + &test_picker.budget, |
| 318 | + Some(&test_picker.bigram), |
| 319 | + ) |
| 320 | + }) |
| 321 | + }); |
| 322 | + } |
| 323 | + |
| 324 | + group.finish(); |
| 325 | +} |
| 326 | + |
| 327 | +fn bench_plain_cold(c: &mut Criterion) { |
| 328 | + let test_picker = setup(); |
| 329 | + let opts = plain_options(); |
| 330 | + |
| 331 | + let queries: &[(&str, &str)] = &[ |
| 332 | + ("2char_if", "if"), |
| 333 | + ("common_return", "return"), |
| 334 | + ("func_mutex_lock", "mutex_lock"), |
| 335 | + ("struct_inode_ops", "inode_operations"), |
| 336 | + ("define_MODULE_LICENSE", "MODULE_LICENSE"), |
| 337 | + ("rare_phylink_ethtool", "phylink_ethtool"), |
| 338 | + ("long_static_int_init", "static int __init"), |
| 339 | + ]; |
| 340 | + |
| 341 | + let mut group = c.benchmark_group("plain_cold"); |
| 342 | + group.sample_size(10); |
| 343 | + group.warm_up_time(Duration::from_millis(500)); |
| 344 | + group.measurement_time(Duration::from_secs(10)); |
| 345 | + |
| 346 | + let canonical = fff::path_utils::canonicalize(&big_repo_path()).expect("canonicalize"); |
| 347 | + |
| 348 | + for (name, query) in queries { |
| 349 | + group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { |
| 350 | + b.iter_with_setup( |
| 351 | + || load_files(&canonical), |
| 352 | + |fresh_files| do_grep(&fresh_files, q, &opts, &test_picker.budget, None), |
| 353 | + ); |
| 354 | + }); |
| 355 | + } |
| 356 | + |
| 357 | + group.finish(); |
| 358 | +} |
| 359 | + |
| 360 | +criterion_group!( |
| 361 | + benches, |
| 362 | + bench_plain_warm, |
| 363 | + bench_bigram_warm, |
| 364 | + bench_fuzzy_warm, |
| 365 | + bench_fuzzy_bigram_warm, |
| 366 | + bench_plain_cold, |
| 367 | +); |
| 368 | + |
| 369 | +criterion_main!(benches); |
0 commit comments