Skip to content

Commit 998db37

Browse files
committed
chore: Update docs for - wip: try to get rid of arcs
1 parent 5f9ce99 commit 998db37

10 files changed

Lines changed: 436 additions & 585 deletions

File tree

crates/fff-core/src/bigram_filter.rs

Lines changed: 85 additions & 282 deletions
Large diffs are not rendered by default.

crates/fff-core/src/file_picker.rs

Lines changed: 51 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ use std::ops::ControlFlow;
5656
use std::path::{Path, PathBuf};
5757
use std::sync::{
5858
Arc, LazyLock,
59-
atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering},
59+
atomic::{AtomicBool, AtomicUsize, Ordering},
6060
};
6161
use std::thread::JoinHandle;
6262
use std::time::SystemTime;
@@ -137,7 +137,7 @@ pub(crate) struct FileSync {
137137
indexable_count: usize,
138138
base_count: usize,
139139
/// Number of active present files that exist in the file system
140-
live_count: usize,
140+
pub(crate) live_count: usize,
141141
/// Sorted directory table. `StableVec` so post-scan snapshots can keep
142142
/// the allocation alive across a picker drop without copying, and so
143143
/// concurrent readers observe a consistent view via the same shared
@@ -752,24 +752,12 @@ impl FilePicker {
752752

753753
{
754754
let mut guard = shared_picker.write()?;
755-
// If the old picker has a post-scan in flight, wait for it to
756-
// finish. cancel() was already called so the rayon loop exits
757-
// within microseconds (each worker checks cancelled per item).
758-
if let Some(ref old_picker) = *guard {
759-
let flag = Arc::clone(&old_picker.signals.post_scan_indexing_active);
760-
drop(guard);
761-
while flag.load(Ordering::Acquire) {
762-
std::thread::sleep(std::time::Duration::from_millis(1));
763-
}
764-
guard = shared_picker.write()?;
765-
}
766755
*guard = Some(picker);
756+
// Dropping the old picker (if one exists) flips its internal
757+
// `cancelled` flag, which automatically cleans up any thread that might
758+
// still be holding a reference to it, safely or unsafely.
767759
}
768760

769-
// `ScanJob::spawn` flips `scanning=true` synchronously before handing
770-
// off to the worker thread, so callers that invoke `wait_for_scan`
771-
// immediately after `new_with_shared_state` are guaranteed to see
772-
// the scan in progress.
773761
ScanJob::new_initial(
774762
shared_picker,
775763
shared_frecency,
@@ -1283,18 +1271,15 @@ impl FilePicker {
12831271
if self
12841272
.signals
12851273
.post_scan_indexing_active
1286-
.load(Ordering::Acquire)
1274+
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
1275+
.is_err()
12871276
{
12881277
tracing::error!(
12891278
"Can not acquire post scan unsafe snapshot, someone already acquired it"
12901279
);
12911280
return None;
12921281
}
12931282

1294-
self.signals
1295-
.post_scan_indexing_active
1296-
.store(true, Ordering::Release);
1297-
12981283
Some(PostScanUnsafeSnapshot {
12991284
files: self.sync_data.files.clone(),
13001285
dirs: self.sync_data.dirs.clone(),
@@ -1654,6 +1639,14 @@ fn canonical_relative_path(path: &Path, base: &Path) -> Option<String> {
16541639
rel.to_str().map(str::to_owned)
16551640
}
16561641

1642+
impl Drop for FilePicker {
1643+
fn drop(&mut self) {
1644+
// Cancel any in-flight ScanJob bound to this picker's signals so
1645+
// it cannot mutate the replacement picker after a swap.
1646+
self.signals.cancelled.store(true, Ordering::Release);
1647+
}
1648+
}
1649+
16571650
#[derive(Debug, Clone, Copy)]
16581651
enum FileSlot {
16591652
Base(usize),
@@ -1715,76 +1708,6 @@ pub struct ScanProgress {
17151708
pub is_warmup_complete: bool,
17161709
}
17171710

1718-
/// Pre-populate mmap caches for the most valuable files so the first grep
1719-
/// search doesn't pay the mmap creation + page fault cost.
1720-
///
1721-
/// All files are collected once, then an O(n) `select_nth_unstable_by`
1722-
/// partitions the top [`MAX_CACHED_CONTENT_FILES`] highest-frecency eligible
1723-
/// files to the front (binary / empty files are pushed to the end by the
1724-
/// comparator). The selected prefix is warmed in parallel via rayon.
1725-
///
1726-
/// Files beyond the budget are still available via temporary mmaps on first
1727-
/// grep access, so correctness is unaffected.
1728-
#[tracing::instrument(skip(files), name = "warmup_mmaps", level = Level::DEBUG)]
1729-
pub(crate) fn warmup_mmaps(
1730-
files: &[FileItem],
1731-
budget: &ContentCacheBudget,
1732-
base_path: &Path,
1733-
arena: ArenaPtr,
1734-
) {
1735-
let max_files = budget.max_files;
1736-
let max_bytes = budget.max_bytes;
1737-
let max_file_size = budget.max_file_size;
1738-
1739-
// Single collect — no pre-filter. The comparator in select_nth pushes
1740-
// ineligible files (binary, empty) to the tail automatically.
1741-
let mut all: Vec<&FileItem> = files.iter().collect();
1742-
1743-
// O(n) partial sort: top max_files eligible-by-frecency files land in
1744-
// all[..max_files]. Ineligible files compare as "lowest priority" so
1745-
// they naturally sink past the partition boundary.
1746-
if all.len() > max_files {
1747-
all.select_nth_unstable_by(max_files, |a, b| {
1748-
let a_ok = !a.is_binary() && a.size > 0;
1749-
let b_ok = !b.is_binary() && b.size > 0;
1750-
match (a_ok, b_ok) {
1751-
(true, false) => std::cmp::Ordering::Less,
1752-
(false, true) => std::cmp::Ordering::Greater,
1753-
(false, false) => std::cmp::Ordering::Equal,
1754-
(true, true) => b.total_frecency_score().cmp(&a.total_frecency_score()),
1755-
}
1756-
});
1757-
}
1758-
1759-
let to_warm = &all[..all.len().min(max_files)];
1760-
1761-
let warmed_bytes = AtomicU64::new(0);
1762-
let budget_exhausted = AtomicBool::new(false);
1763-
1764-
BACKGROUND_THREAD_POOL.install(|| {
1765-
to_warm.par_iter().for_each(|file| {
1766-
if budget_exhausted.load(Ordering::Relaxed) {
1767-
return;
1768-
}
1769-
1770-
if file.is_binary() || file.size == 0 || file.size > max_file_size {
1771-
return;
1772-
}
1773-
1774-
// Byte budget.
1775-
let prev_bytes = warmed_bytes.fetch_add(file.size, Ordering::Relaxed);
1776-
if prev_bytes + file.size > max_bytes {
1777-
budget_exhausted.store(true, Ordering::Relaxed);
1778-
return;
1779-
}
1780-
1781-
if let Some(content) = file.get_content(arena, base_path, budget) {
1782-
let _ = std::hint::black_box(content.first());
1783-
}
1784-
});
1785-
});
1786-
}
1787-
17881711
impl FileSync {
17891712
pub(crate) fn discover_git_workdir(base_path: &Path) -> Option<PathBuf> {
17901713
let git_workdir = Repository::discover(base_path)
@@ -1941,12 +1864,15 @@ impl FileSync {
19411864
// (one per partition) to preserve O(log n) lookups.
19421865
//
19431866
// "Indexable" = can possibly contribute bigrams: not binary-by-extension,
1944-
// non-zero size, not larger than the bigram/mmap cap. The cap matches
1945-
// `ContentCacheBudget::max_file_size` default (10 MB) — any file above
1946-
// that is skipped by `build_bigram_index` anyway.
1947-
const BIGRAM_ELIGIBLE_MAX_SIZE: u64 = 10 * 1024 * 1024;
1948-
let is_indexable =
1949-
|f: &FileItem| !f.is_binary() && f.size > 0 && f.size <= BIGRAM_ELIGIBLE_MAX_SIZE;
1867+
// non-zero size, not larger than `BIGRAM_CONTENT_CAP`. Capping indexable
1868+
// size at the bigram scan window means every indexed file is fully
1869+
// covered — no partial-content false negatives. Files above the cap
1870+
// land past `indexable_count` and are always scanned at grep time.
1871+
let is_indexable = |f: &FileItem| {
1872+
!f.is_binary()
1873+
&& f.size > 0
1874+
&& f.size <= crate::bigram_filter::BIGRAM_CONTENT_CAP as u64
1875+
};
19501876
BACKGROUND_THREAD_POOL.install(|| {
19511877
files.par_sort_unstable_by(|a, b| {
19521878
// Sort indexables first (true < false when we invert with !).
@@ -1999,6 +1925,33 @@ impl FileSync {
19991925
}
20001926
}
20011927

1928+
/// Pre-populate mmap caches for cold tail files so the first grep search
1929+
/// doesn't pay the mmap creation + page fault cost.
1930+
#[tracing::instrument(skip(files), name = "warmup_mmaps", level = Level::DEBUG)]
1931+
pub(crate) fn warmup_mmaps(
1932+
files: &[FileItem],
1933+
budget: &ContentCacheBudget,
1934+
base_path: &Path,
1935+
arena: ArenaPtr,
1936+
) {
1937+
// for most use cases the mmap limit would be significantly smaller than a repo
1938+
for file in files.iter() {
1939+
if file.is_likely_hot()
1940+
|| file.is_binary()
1941+
|| file.size == 0
1942+
|| file.size > budget.max_file_size
1943+
{
1944+
continue;
1945+
}
1946+
1947+
let _ = file.get_cached_content(arena, base_path, budget);
1948+
1949+
if budget.is_exhausted() {
1950+
break;
1951+
}
1952+
}
1953+
}
1954+
20021955
/// This does both things (yes sorry all the OOP morons)
20031956
/// in one go: populates files chunked storage and creates new directories
20041957
fn populates_dirs_files_chunked_storage<'a>(

crates/fff-core/src/grep.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1547,7 +1547,7 @@ fn fuzzy_grep_search<'a>(
15471547
abort_signal: &AtomicBool,
15481548
base_path: &Path,
15491549
arena: crate::simd_path::ArenaPtr,
1550-
_overflow_arena: crate::simd_path::ArenaPtr,
1550+
overflow_arena: crate::simd_path::ArenaPtr,
15511551
) -> GrepResult<'a> {
15521552
// max_typos controls how many *needle* characters can be unmatched.
15531553
// A transposition (e.g. "shcema" → "schema") costs ~1 typo with
@@ -1655,7 +1655,9 @@ fn fuzzy_grep_search<'a>(
16551655
return None;
16561656
}
16571657

1658-
let file_bytes = file.get_content_for_search(buf, arena, base_path, budget)?;
1658+
1659+
let file_arena = if file.is_overflow() { overflow_arena } else { arena };
1660+
let file_bytes = file.get_content_for_search(buf, file_arena, base_path, budget)?;
16591661

16601662
// File-level prefilter: check if enough distinct needle chars
16611663
// exist anywhere in the file bytes. Uses memchr for speed.

0 commit comments

Comments
 (0)