@@ -63,74 +63,34 @@ fn clamp_lexical_weight(raw_sim: f64, src_path: Option<&Path>, dst_path: Option<
 
 pub struct LexicalEdgeBuilder;
 
-impl LexicalEdgeBuilder {
-    fn compute_doc_frequencies(&self, fragments: &[Fragment]) -> FxHashMap<String, usize> {
-        let mut doc_freq: FxHashMap<String, usize> = FxHashMap::default();
-        for frag in fragments {
-            let profile = profile_from_path(frag.path());
-            let idents = extract_identifier_list(&frag.content, 3);
-            let filtered = filter_idents(&idents, 3, profile);
-            let mut seen: FxHashSet<String> = FxHashSet::default();
-            for ident in filtered {
-                if seen.insert(ident.clone()) {
-                    *doc_freq.entry(ident).or_insert(0) += 1;
-                }
-            }
+/// Maps each unique term to a compact u32 id. Stores each term string exactly once.
+struct TermInterner {
+    by_str: FxHashMap<String, u32>,
+}
+
+impl TermInterner {
+    fn new() -> Self {
+        Self {
+            by_str: FxHashMap::default(),
         }
-        doc_freq
     }
 
-    fn compute_idf(
-        &self,
-        doc_freq: &FxHashMap<String, usize>,
-        n_docs: usize,
-    ) -> FxHashMap<String, f64> {
-        doc_freq
-            .iter()
-            .map(|(term, &df)| {
-                let idf = ((n_docs as f64 + 1.0) / (df as f64 + 1.0)).ln() + 1.0;
-                (term.clone(), idf)
-            })
-            .collect()
+    fn intern(&mut self, term: String) -> u32 {
+        let next_id = self.by_str.len() as u32;
+        *self.by_str.entry(term).or_insert(next_id)
+    }
+
+    fn len(&self) -> usize {
+        self.by_str.len()
     }
+}
 
-    fn build_tf_idf_vector(
-        &self,
-        frag: &Fragment,
-        doc_freq: &FxHashMap<String, usize>,
-        idf: &FxHashMap<String, f64>,
-        max_df: usize,
-    ) -> FxHashMap<String, f64> {
+impl LexicalEdgeBuilder {
+    /// Tokenize and filter identifiers for one fragment. Returns the raw filtered identifier list.
+    fn tokens(frag: &Fragment) -> Vec<String> {
         let profile = profile_from_path(frag.path());
         let idents = extract_identifier_list(&frag.content, 3);
-        let filtered = filter_idents(&idents, 3, profile);
-
-        let mut tf: FxHashMap<String, usize> = FxHashMap::default();
-        for ident in filtered {
-            *tf.entry(ident).or_insert(0) += 1;
-        }
-
-        let mut vec: FxHashMap<String, f64> = FxHashMap::default();
-        for (term, &count) in &tf {
-            let df = doc_freq.get(term).copied().unwrap_or(0);
-            if df == 0 || df > max_df {
-                continue;
-            }
-            let term_idf = idf.get(term).copied().unwrap_or(1.0);
-            if term_idf < LEXICAL.min_idf {
-                continue;
-            }
-            vec.insert(term.clone(), count as f64 * term_idf);
-        }
-
-        let norm: f64 = vec.values().map(|v| v * v).sum::<f64>().sqrt();
-        if norm > 0.0 {
-            for v in vec.values_mut() {
-                *v /= norm;
-            }
-        }
-
-        vec
+        filter_idents(&idents, 3, profile)
     }
 }
 
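A note on the interning trick in `TermInterner::intern`: `by_str.len()` is read before the insertion, so it doubles as the next fresh id, and `entry(...).or_insert(...)` only stores it when the term is genuinely new. A minimal standalone sketch of the same pattern, with `std::collections::HashMap` standing in for the crate's `FxHashMap` so it compiles on its own (the example terms are made up):

```rust
use std::collections::HashMap;

// Illustrative re-implementation of the interning pattern from the diff above.
struct TermInterner {
    by_str: HashMap<String, u32>,
}

impl TermInterner {
    fn intern(&mut self, term: String) -> u32 {
        // len() before insertion is the next unused id; or_insert keeps
        // the existing id if the term was interned earlier.
        let next_id = self.by_str.len() as u32;
        *self.by_str.entry(term).or_insert(next_id)
    }
}

fn main() {
    let mut interner = TermInterner { by_str: HashMap::new() };
    assert_eq!(interner.intern("parse_config".into()), 0); // new term -> id 0
    assert_eq!(interner.intern("read_header".into()), 1); // new term -> id 1
    assert_eq!(interner.intern("parse_config".into()), 0); // repeat -> same id
}
```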
@@ -140,81 +100,155 @@ impl EdgeBuilder for LexicalEdgeBuilder {
             return FxHashMap::default();
         }
 
-        let doc_freq = self.compute_doc_frequencies(fragments);
         let n_docs = fragments.len();
         let max_df = (n_docs as f64 * LEXICAL.max_df_ratio).max(1.0) as usize;
-        let idf = self.compute_idf(&doc_freq, n_docs);
 
-        let tf_idf_vectors: FxHashMap<FragmentId, FxHashMap<String, f64>> = fragments
+        // Pass 1: tokenize each fragment in parallel, collecting one Vec<String> per fragment.
+        let per_frag_tokens: Vec<Vec<String>> =
+            fragments.par_iter().map(|f| Self::tokens(f)).collect();
+
+        // Pass 2: build the term interner serially, computing document frequency in one go.
+        let mut interner = TermInterner::new();
+        let mut doc_freq: Vec<u32> = Vec::new();
+        let per_frag_term_ids: Vec<Vec<u32>> = per_frag_tokens
+            .into_iter()
+            .map(|tokens| {
+                let mut seen_in_doc: FxHashSet<u32> = FxHashSet::default();
+                let mut ids: Vec<u32> = Vec::with_capacity(tokens.len());
+                for tok in tokens {
+                    let id = interner.intern(tok);
+                    if doc_freq.len() <= id as usize {
+                        doc_freq.resize(id as usize + 1, 0);
+                    }
+                    if seen_in_doc.insert(id) {
+                        doc_freq[id as usize] += 1;
+                    }
+                    ids.push(id);
+                }
+                ids
+            })
+            .collect();
+
+        let n_terms = interner.len();
+        // The interner's string table is no longer needed once doc_freq has been built.
+        drop(interner);
+
+        let n_docs_f = n_docs as f64;
+        let mut idf: Vec<f32> = Vec::with_capacity(n_terms);
+        for &df in &doc_freq {
+            let v = ((n_docs_f + 1.0) / (df as f64 + 1.0)).ln() + 1.0;
+            idf.push(v as f32);
+        }
+
+        // Pass 3: build TF-IDF vectors as sparse Vec<(TermId, f32)>, normalized.
+        let tf_idf: Vec<Vec<(u32, f32)>> = per_frag_term_ids
             .par_iter()
-            .map(|frag| {
-                let vec = self.build_tf_idf_vector(frag, &doc_freq, &idf, max_df);
-                (frag.id.clone(), vec)
+            .map(|term_ids| {
+                let mut tf: FxHashMap<u32, u32> = FxHashMap::default();
+                for &id in term_ids {
+                    *tf.entry(id).or_insert(0) += 1;
+                }
+                let mut vec: Vec<(u32, f32)> = Vec::with_capacity(tf.len());
+                for (&term_id, &count) in &tf {
+                    let df = doc_freq[term_id as usize] as usize;
+                    if df == 0 || df > max_df {
+                        continue;
+                    }
+                    let term_idf = idf[term_id as usize];
+                    if (term_idf as f64) < LEXICAL.min_idf {
+                        continue;
+                    }
+                    vec.push((term_id, count as f32 * term_idf));
+                }
+                let norm: f32 = vec.iter().map(|(_, w)| w * w).sum::<f32>().sqrt();
+                if norm > 0.0 {
+                    for (_, w) in &mut vec {
+                        *w /= norm;
+                    }
+                }
+                vec.sort_unstable_by_key(|&(id, _)| id);
+                vec
             })
             .collect();
 
-        let mut postings: FxHashMap<String, Vec<(FragmentId, f64)>> = FxHashMap::default();
-        for (frag_id, vec) in &tf_idf_vectors {
-            for (term, &weight) in vec {
-                postings
-                    .entry(term.clone())
-                    .or_default()
-                    .push((frag_id.clone(), weight));
+        drop(per_frag_term_ids);
+        drop(doc_freq);
+        drop(idf);
+
+        // Pass 4: invert into postings: one (frag_idx, weight) list per term.
+        // Consume tf_idf as we go so it never coexists with the inverted index.
+        let mut postings: Vec<Vec<(u32, f32)>> = vec![Vec::new(); n_terms];
+        for (frag_idx, vec) in tf_idf.into_iter().enumerate() {
+            for (term_id, weight) in vec {
+                postings[term_id as usize].push((frag_idx as u32, weight));
             }
         }
 
-        let mut dot_products: FxHashMap<(FragmentId, FragmentId), f64> = FxHashMap::default();
-        for (_term, posting_list) in &postings {
-            if posting_list.len() > LEXICAL.max_postings {
+        // Pass 5: O(F²) inner loop over each posting list, capped by max_postings.
+        // Drop each posting list as soon as we are done with it.
+        let mut dot_products: FxHashMap<(u32, u32), f32> = FxHashMap::default();
+        for posting_list in postings.iter_mut() {
+            if posting_list.len() > LEXICAL.max_postings || posting_list.len() < 2 {
+                posting_list.clear();
+                posting_list.shrink_to_fit();
                 continue;
             }
             for i in 0..posting_list.len() {
-                let (ref frag_i, weight_i) = posting_list[i];
+                let (frag_i, weight_i) = posting_list[i];
                 for j in (i + 1)..posting_list.len() {
-                    let (ref frag_j, weight_j) = posting_list[j];
-                    let pair = if frag_i.to_string() < frag_j.to_string() {
-                        (frag_i.clone(), frag_j.clone())
+                    let (frag_j, weight_j) = posting_list[j];
+                    let pair = if frag_i < frag_j {
+                        (frag_i, frag_j)
                     } else {
-                        (frag_j.clone(), frag_i.clone())
+                        (frag_j, frag_i)
                     };
                     *dot_products.entry(pair).or_insert(0.0) += weight_i * weight_j;
                 }
             }
+            posting_list.clear();
+            posting_list.shrink_to_fit();
         }
+        drop(postings);
 
-        let id_to_path: FxHashMap<FragmentId, &str> =
-            fragments.iter().map(|f| (f.id.clone(), f.path())).collect();
-
-        let mut neighbors_by_node: FxHashMap<FragmentId, Vec<(f64, FragmentId)>> =
-            FxHashMap::default();
+        // Pass 6: turn pairwise similarities into per-node top-k candidate edges.
+        let frag_paths: Vec<&str> = fragments.iter().map(|f| f.path()).collect();
+        let mut neighbors_by_node: FxHashMap<u32, Vec<(f32, u32)>> = FxHashMap::default();
 
-        for ((src, dst), sim) in &dot_products {
-            if *sim < LEXICAL.min_similarity {
+        let min_sim = LEXICAL.min_similarity as f32;
+        let backward_factor = LEXICAL.backward_factor as f32;
+        for ((src_idx, dst_idx), sim) in &dot_products {
+            if *sim < min_sim {
                 continue;
             }
-            let src_path = id_to_path.get(src).map(|s| Path::new(*s));
-            let dst_path = id_to_path.get(dst).map(|s| Path::new(*s));
-            let fwd = clamp_lexical_weight(*sim, src_path, dst_path);
-            let bwd = clamp_lexical_weight(*sim, dst_path, src_path) * LEXICAL.backward_factor;
+            let src_path = Path::new(frag_paths[*src_idx as usize]);
+            let dst_path = Path::new(frag_paths[*dst_idx as usize]);
+            let fwd = clamp_lexical_weight(*sim as f64, Some(src_path), Some(dst_path)) as f32;
+            let bwd = clamp_lexical_weight(*sim as f64, Some(dst_path), Some(src_path)) as f32
+                * backward_factor;
             neighbors_by_node
-                .entry(src.clone())
+                .entry(*src_idx)
                 .or_default()
-                .push((fwd, dst.clone()));
+                .push((fwd, *dst_idx));
             neighbors_by_node
-                .entry(dst.clone())
+                .entry(*dst_idx)
                 .or_default()
-                .push((bwd, src.clone()));
+                .push((bwd, *src_idx));
         }
 
+        let frag_ids: Vec<&FragmentId> = fragments.iter().map(|f| &f.id).collect();
         let mut edges: EdgeDict = FxHashMap::default();
-        for (_node, mut candidates) in neighbors_by_node {
+        for (node_idx, mut candidates) in neighbors_by_node {
             candidates.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
             candidates.truncate(LEXICAL.top_k_neighbors);
-            for (weight, neighbor) in candidates {
-                let key = (_node.clone(), neighbor);
+            for (weight, neighbor_idx) in candidates {
+                let key = (
+                    frag_ids[node_idx as usize].clone(),
+                    frag_ids[neighbor_idx as usize].clone(),
+                );
                 let existing = edges.get(&key).copied().unwrap_or(0.0);
-                if weight > existing {
-                    edges.insert(key, weight);
+                let weight_f64 = weight as f64;
+                if weight_f64 > existing {
+                    edges.insert(key, weight_f64);
                 }
             }
         }
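Why pass 5's accumulation works: pass 3 L2-normalizes every sparse vector, so summing `weight_i * weight_j` over each posting list that contains both fragments reproduces the full cosine similarity, cos(a, b) = sum_t a[t] * b[t] when both norms are 1. A self-contained sketch of that identity under the same scheme, with `std` maps standing in for `FxHashMap` and made-up weights:

```rust
use std::collections::HashMap;

fn main() {
    // Two L2-normalized sparse vectors over term ids (made-up weights):
    // ||a|| = sqrt(0.36 + 0.64) = 1, ||b|| = 1.
    let a: Vec<(u32, f32)> = vec![(0, 0.6), (1, 0.8)];
    let b: Vec<(u32, f32)> = vec![(1, 1.0)];

    // Invert into postings: term id -> list of (frag_idx, weight).
    let mut postings: HashMap<u32, Vec<(u32, f32)>> = HashMap::new();
    for (frag_idx, vec) in [a, b].into_iter().enumerate() {
        for (term_id, w) in vec {
            postings.entry(term_id).or_default().push((frag_idx as u32, w));
        }
    }

    // Accumulate pairwise products term by term, as in pass 5.
    let mut dot_products: HashMap<(u32, u32), f32> = HashMap::new();
    for posting_list in postings.values() {
        for i in 0..posting_list.len() {
            let (frag_i, weight_i) = posting_list[i];
            for j in (i + 1)..posting_list.len() {
                let (frag_j, weight_j) = posting_list[j];
                let pair = if frag_i < frag_j { (frag_i, frag_j) } else { (frag_j, frag_i) };
                *dot_products.entry(pair).or_insert(0.0) += weight_i * weight_j;
            }
        }
    }

    // Only term 1 is shared, so cos(a, b) = 0.8 * 1.0 = 0.8.
    assert!((dot_products[&(0, 1)] - 0.8).abs() < 1e-6);
}
```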