@@ -631,34 +631,76 @@ pub(crate) fn build_bigram_index(
631631 budget : & crate :: types:: ContentCacheBudget ,
632632 base_path : & std:: path:: Path ,
633633 arena : crate :: simd_path:: ArenaPtr ,
634+ warmup : bool ,
634635) -> ( BigramFilter , Vec < usize > ) {
635636 let start = std:: time:: Instant :: now ( ) ;
636- tracing:: info!( "Building bigram index for {} files..." , files. len( ) ) ;
637+ tracing:: info!(
638+ "Building bigram index for {} files (warmup={})" ,
639+ files. len( ) ,
640+ warmup,
641+ ) ;
637642
638643 let builder = BigramIndexBuilder :: new ( files. len ( ) ) ;
639644 let skip_builder = BigramIndexBuilder :: new ( files. len ( ) ) ;
640645
641- // this does remove a memcpy for every single file + actually reducing open time on macos
642646 #[ cfg( unix) ]
643647 let base_fd: libc:: c_int = open_base_dir_fd ( base_path) ;
644648 #[ cfg( not( unix) ) ]
645649 let base_fd: i32 = -1 ;
646650
647- // `content_binary` is only touched from the Binary branch below, so
648- // the mutex is cold in practice. A lock-free collector wasn't worth
649- // the complexity.
650651 let content_binary: std:: sync:: Mutex < Vec < usize > > = std:: sync:: Mutex :: new ( Vec :: new ( ) ) ;
651652
652- crate :: file_picker:: BACKGROUND_THREAD_POOL . install ( || {
653- files
654- . par_chunks ( BIGRAM_CHUNK_FILES )
655- . enumerate ( )
656- . for_each ( |( chunk_idx, chunk) | {
657- let base_idx = chunk_idx * BIGRAM_CHUNK_FILES ;
658- for ( offset, file) in chunk. iter ( ) . enumerate ( ) {
659- let file_idx = base_idx + offset;
653+ // When warmup is enabled, process high-frecency files first so they
654+ // fill the limited cache budget before lower-priority files consume it.
655+ // This replaces the separate warmup_mmaps pass with zero extra syscalls.
656+ if warmup {
657+ let max_files = budget. max_files ;
658+ // Partition indices: top `max_files` by frecency go first.
659+ let mut indices: Vec < usize > = ( 0 ..files. len ( ) ) . collect ( ) ;
660+ if indices. len ( ) > max_files {
661+ indices. select_nth_unstable_by ( max_files, |& a, & b| {
662+ let fa = & files[ a] ;
663+ let fb = & files[ b] ;
664+ let a_ok = !fa. is_binary ( ) && fa. size > 0 ;
665+ let b_ok = !fb. is_binary ( ) && fb. size > 0 ;
666+ match ( a_ok, b_ok) {
667+ ( true , false ) => std:: cmp:: Ordering :: Less ,
668+ ( false , true ) => std:: cmp:: Ordering :: Greater ,
669+ ( false , false ) => std:: cmp:: Ordering :: Equal ,
670+ ( true , true ) => fb. total_frecency_score ( ) . cmp ( & fa. total_frecency_score ( ) ) ,
671+ }
672+ } ) ;
673+ }
674+
675+ // Process priority files first (fills cache), then the rest.
676+ let priority_count = max_files. min ( indices. len ( ) ) ;
677+ let ( priority, rest) = indices. split_at ( priority_count) ;
678+
679+ crate :: file_picker:: BACKGROUND_THREAD_POOL . install ( || {
680+ // Phase 1: high-frecency files fill the cache budget.
681+ priority. par_chunks ( BIGRAM_CHUNK_FILES ) . for_each ( |chunk| {
682+ for & file_idx in chunk {
683+ let outcome = process_file (
684+ & files[ file_idx] ,
685+ file_idx,
686+ & builder,
687+ & skip_builder,
688+ base_fd,
689+ base_path,
690+ arena,
691+ budget,
692+ ) ;
693+ if matches ! ( outcome, FileOutcome :: Binary ) {
694+ content_binary. lock ( ) . unwrap ( ) . push ( file_idx) ;
695+ }
696+ }
697+ } ) ;
698+
699+ // Phase 2: remaining files (cache budget likely exhausted, uses openat).
700+ rest. par_chunks ( BIGRAM_CHUNK_FILES ) . for_each ( |chunk| {
701+ for & file_idx in chunk {
660702 let outcome = process_file (
661- file ,
703+ & files [ file_idx ] ,
662704 file_idx,
663705 & builder,
664706 & skip_builder,
@@ -672,12 +714,37 @@ pub(crate) fn build_bigram_index(
672714 }
673715 }
674716 } ) ;
675- } ) ;
717+ } ) ;
718+ } else {
719+ // No warmup: process in natural order (no cache priority needed).
720+ crate :: file_picker:: BACKGROUND_THREAD_POOL . install ( || {
721+ files
722+ . par_chunks ( BIGRAM_CHUNK_FILES )
723+ . enumerate ( )
724+ . for_each ( |( chunk_idx, chunk) | {
725+ let base_idx = chunk_idx * BIGRAM_CHUNK_FILES ;
726+ for ( offset, file) in chunk. iter ( ) . enumerate ( ) {
727+ let file_idx = base_idx + offset;
728+ let outcome = process_file (
729+ file,
730+ file_idx,
731+ & builder,
732+ & skip_builder,
733+ base_fd,
734+ base_path,
735+ arena,
736+ budget,
737+ ) ;
738+ if matches ! ( outcome, FileOutcome :: Binary ) {
739+ content_binary. lock ( ) . unwrap ( ) . push ( file_idx) ;
740+ }
741+ }
742+ } ) ;
743+ } ) ;
744+ }
676745
677746 #[ cfg( unix) ]
678747 if base_fd >= 0 {
679- // SAFETY: we opened `base_fd` at the top of this function and
680- // no worker still references it once the rayon pool joined.
681748 unsafe { libc:: close ( base_fd) } ;
682749 }
683750
@@ -688,9 +755,6 @@ pub(crate) fn build_bigram_index(
688755 let skip_index = skip_builder. compress ( Some ( SKIP_INDEX_MIN_DENSITY_PCT ) ) ;
689756 index. set_skip_index ( skip_index) ;
690757
691- // Builder buffers were freed by `compress()` above (one deallocation
692- // each); nudge mimalloc to return them (and any transient allocs)
693- // to the OS.
694758 crate :: file_picker:: hint_allocator_collect ( ) ;
695759
696760 tracing:: info!(
0 commit comments