@@ -184,42 +184,54 @@ impl DiscoveryStrategy for BM25Discovery {
184184 if query_tokens. is_empty ( ) {
185185 return Vec :: new ( ) ;
186186 }
187+ let query_set: FxHashSet < String > = query_tokens. into_iter ( ) . collect ( ) ;
187188
188189 let changed_set: FxHashSet < & Path > = ctx. changed_files . iter ( ) . map ( |p| p. as_path ( ) ) . collect ( ) ;
189- let mut corpus: Vec < Vec < String > > = Vec :: new ( ) ;
190- let mut paths: Vec < PathBuf > = Vec :: new ( ) ;
191190
192- for f in & ctx. all_candidates {
193- if changed_set. contains ( f. as_path ( ) ) {
194- continue ;
195- }
196- let content = match ctx. read_file ( f) {
197- Some ( c) => c,
198- None => continue ,
199- } ;
200- corpus. push ( extract_identifier_list (
201- & content,
202- BM25 . min_query_token_length ,
203- ) ) ;
204- paths. push ( f. clone ( ) ) ;
205- }
191+ // Parallel tokenization: previously a serial loop, the dominant
192+ // cost on mega-repos (vscode/mui ~5k TS files). par_iter saturates
193+ // available rayon threads.
194+ let pairs: Vec < ( PathBuf , Vec < String > ) > = ctx
195+ . all_candidates
196+ . par_iter ( )
197+ . filter ( |f| !changed_set. contains ( f. as_path ( ) ) )
198+ . filter_map ( |f| {
199+ let content = ctx. read_file ( f) ?;
200+ Some ( (
201+ f. clone ( ) ,
202+ extract_identifier_list ( & content, BM25 . min_query_token_length ) ,
203+ ) )
204+ } )
205+ . collect ( ) ;
206206
207- if corpus . is_empty ( ) {
207+ if pairs . is_empty ( ) {
208208 return Vec :: new ( ) ;
209209 }
210+ let n_docs = pairs. len ( ) ;
211+ if n_docs > 5000 {
212+ tracing:: warn!(
213+ "BM25Discovery: large candidate corpus ({n_docs} docs) — using inverted-index fast path"
214+ ) ;
215+ }
210216
211- let n_docs = corpus . len ( ) ;
212- let avgdl = corpus . iter ( ) . map ( |d| d . len ( ) ) . sum :: < usize > ( ) as f64 / n_docs as f64 ;
213-
217+ // Single pass: compute df globally + inverted-index posting lists
218+ // for query terms only (skip indexing terms not in the query — they
219+ // are never needed and would balloon memory on large repos).
214220 let mut df: FxHashMap < String , usize > = FxHashMap :: default ( ) ;
215- for doc in & corpus {
221+ let mut postings: FxHashMap < String , Vec < usize > > = FxHashMap :: default ( ) ;
222+ let mut total_len: usize = 0 ;
223+ for ( doc_id, ( _, doc) ) in pairs. iter ( ) . enumerate ( ) {
224+ total_len += doc. len ( ) ;
216225 let unique: FxHashSet < & str > = doc. iter ( ) . map ( |s| s. as_str ( ) ) . collect ( ) ;
217226 for term in unique {
218227 * df. entry ( term. to_string ( ) ) . or_insert ( 0 ) += 1 ;
228+ if query_set. contains ( term) {
229+ postings. entry ( term. to_string ( ) ) . or_default ( ) . push ( doc_id) ;
230+ }
219231 }
220232 }
233+ let avgdl = total_len as f64 / n_docs as f64 ;
221234
222- let query_set: FxHashSet < String > = query_tokens. into_iter ( ) . collect ( ) ;
223235 let idf: FxHashMap < String , f64 > = query_set
224236 . iter ( )
225237 . map ( |t| {
@@ -230,23 +242,37 @@ impl DiscoveryStrategy for BM25Discovery {
230242 } )
231243 . collect ( ) ;
232244
233- let scores: Vec < f64 > = corpus
245+ // Candidate doc-ids = union of posting lists for query terms. Docs
246+ // not in this set contain zero query terms and would score 0 — skip
247+ // them. This is the algorithmic win: scoring shrinks from O(N_docs)
248+ // to O(|posting-list union|), typically ~10-100× smaller on big
249+ // corpora where the query is sparse against the corpus vocabulary.
250+ let mut candidate_ids: FxHashSet < usize > = FxHashSet :: default ( ) ;
251+ for term in & query_set {
252+ if let Some ( p) = postings. get ( term) {
253+ candidate_ids. extend ( p) ;
254+ }
255+ }
256+ if candidate_ids. is_empty ( ) {
257+ return Vec :: new ( ) ;
258+ }
259+
260+ let candidate_vec: Vec < usize > = candidate_ids. into_iter ( ) . collect ( ) ;
261+ let scored: Vec < ( usize , f64 ) > = candidate_vec
234262 . par_iter ( )
235- . map ( |doc| Self :: bm25_score ( doc, & query_set, & idf, avgdl) )
263+ . map ( |& doc_id| {
264+ let s = Self :: bm25_score ( & pairs[ doc_id] . 1 , & query_set, & idf, avgdl) ;
265+ ( doc_id, s)
266+ } )
236267 . collect ( ) ;
237268
238- let mut ranked: Vec < usize > = ( 0 ..scores. len ( ) ) . collect ( ) ;
239- ranked. sort_by ( |& a, & b| {
240- scores[ b]
241- . partial_cmp ( & scores[ a] )
242- . unwrap_or ( std:: cmp:: Ordering :: Equal )
243- } ) ;
269+ let mut ranked: Vec < ( usize , f64 ) > = scored. into_iter ( ) . filter ( |( _, s) | * s > 0.0 ) . collect ( ) ;
270+ ranked. sort_by ( |a, b| b. 1 . partial_cmp ( & a. 1 ) . unwrap_or ( std:: cmp:: Ordering :: Equal ) ) ;
244271
245272 ranked
246273 . into_iter ( )
247274 . take ( self . top_k )
248- . filter ( |& i| scores[ i] > 0.0 )
249- . map ( |i| paths[ i] . clone ( ) )
275+ . map ( |( i, _) | pairs[ i] . 0 . clone ( ) )
250276 . collect ( )
251277 }
252278}
0 commit comments