
Commit 5a71246

perf(diffctx): drop nested-hashmap graph, intern lexical terms, share content via Arc
- Graph::freeze() consumes the fwd/rev hashmaps into CSR via mem::take, so read-time storage is CSR-only; ego_graph and the postpass now walk the CSR via for_each_forward_neighbor / forward_edge_weight (sketched just below).
- LexicalEdgeBuilder uses a TermInterner plus per-fragment Vec<u32> term ids and Vec-indexed postings, replacing per-fragment FxHashMap<String, f64> and cloned FragmentId tuples. Intermediates are dropped eagerly.
- FragmentEntry/PyFragment store content as Option<Arc<str>>; render shares the Arc instead of calling to_string() for each selected fragment (see the sketch after the filtering.rs diff).
- DiscoveryContext::read_file returns Cow<'_, str>; the pipeline drops discovery_ctx (and its 200MB file_cache) right after discovery (a caller-side sketch follows the discovery.rs diff).
- filter_unrelated_fragments takes &[Fragment]; BM25Scoring filters inline, eliminating the all_fragments.to_vec() before each filter chain.
- FragmentId's Ord is used directly in the lexical pair comparison instead of to_string() allocations.
1 parent 92f7044 commit 5a71246
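The graph change in the first bullet touches files that are not among the diffs shown below, so here is a minimal sketch of the build-then-freeze CSR pattern it describes. Only the freeze / for_each_forward_neighbor / forward_edge_weight names come from the commit message; the u32 node ids, field names, and the use of std HashMap instead of FxHashMap are illustrative assumptions.

use std::collections::HashMap;

// Sketch only: nodes are plain u32 ids; the real graph keys on FragmentId.
#[derive(Default)]
pub struct Graph {
    // Build-time nested adjacency map; emptied by freeze() via mem::take.
    fwd: HashMap<u32, HashMap<u32, f64>>,
    n_nodes: usize,
    // Read-time CSR storage: node i's edges live at targets[offsets[i]..offsets[i + 1]].
    offsets: Vec<usize>,
    targets: Vec<u32>,
    weights: Vec<f64>,
}

impl Graph {
    pub fn add_edge(&mut self, src: u32, dst: u32, w: f64) {
        self.n_nodes = self.n_nodes.max(src.max(dst) as usize + 1);
        self.fwd.entry(src).or_default().insert(dst, w);
    }

    /// Consume the nested hashmap (mem::take leaves an empty map behind) and
    /// lay the edges out contiguously, grouped by source node.
    pub fn freeze(&mut self) {
        let fwd = std::mem::take(&mut self.fwd);
        self.offsets = Vec::with_capacity(self.n_nodes + 1);
        self.offsets.push(0);
        for node in 0..self.n_nodes as u32 {
            if let Some(nbrs) = fwd.get(&node) {
                for (&dst, &w) in nbrs {
                    self.targets.push(dst);
                    self.weights.push(w);
                }
            }
            self.offsets.push(self.targets.len());
        }
    }

    /// Visit each forward neighbor of `node` without allocating a map or iterator.
    pub fn for_each_forward_neighbor(&self, node: u32, mut f: impl FnMut(u32, f64)) {
        let (lo, hi) = (self.offsets[node as usize], self.offsets[node as usize + 1]);
        for i in lo..hi {
            f(self.targets[i], self.weights[i]);
        }
    }

    /// Replacement for neighbors(a).and_then(|n| n.get(b)): a linear scan of one node's slice.
    pub fn forward_edge_weight(&self, src: u32, dst: u32) -> Option<f64> {
        let (lo, hi) = (self.offsets[src as usize], self.offsets[src as usize + 1]);
        (lo..hi).find(|&i| self.targets[i] == dst).map(|i| self.weights[i])
    }
}

After freeze(), reads touch only the three flat vectors, which is what allows the per-node hashmaps to be released.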

10 files changed

Lines changed: 332 additions & 225 deletions


diffctx/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ tree-sitter-prisma-io = "1.6"
 tree-sitter-svelte-ng = "1.0"
 tiktoken-rs = "0.6"
 rayon = "1.10"
-serde = { version = "1", features = ["derive"] }
+serde = { version = "1", features = ["derive", "rc"] }
 serde_json = "1"
 serde_yaml = "0.9"
 thiserror = "2"

diffctx/src/discovery.rs

Lines changed: 4 additions & 3 deletions
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};

@@ -16,11 +17,11 @@ pub struct DiscoveryContext {
 }

 impl DiscoveryContext {
-    pub fn read_file(&self, path: &Path) -> Option<String> {
+    pub fn read_file(&self, path: &Path) -> Option<Cow<'_, str>> {
         if let Some(content) = self.file_cache.get(path) {
-            return Some(content.clone());
+            return Some(Cow::Borrowed(content.as_str()));
         }
-        std::fs::read_to_string(path).ok()
+        std::fs::read_to_string(path).ok().map(Cow::Owned)
     }
 }
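For the call-site effect of the Cow change, here is a small self-contained sketch using a hypothetical stand-in for DiscoveryContext (only read_file's shape is taken from the diff above): a cache hit borrows from file_cache with no copy, a miss yields an owned String, and callers treat both the same way through Deref.

use std::borrow::Cow;
use std::collections::HashMap;
use std::path::{Path, PathBuf};

// Hypothetical stand-in, trimmed to the cache plus read_file pair; names are illustrative.
struct Ctx {
    file_cache: HashMap<PathBuf, String>,
}

impl Ctx {
    fn read_file(&self, path: &Path) -> Option<Cow<'_, str>> {
        if let Some(content) = self.file_cache.get(path) {
            return Some(Cow::Borrowed(content.as_str())); // cache hit: zero-copy borrow
        }
        std::fs::read_to_string(path).ok().map(Cow::Owned) // miss: read and own
    }
}

fn main() {
    let ctx = Ctx {
        file_cache: HashMap::from([(PathBuf::from("a.rs"), String::from("fn main() {}"))]),
    };
    // Callers use the result as &str either way, via Deref.
    if let Some(content) = ctx.read_file(Path::new("a.rs")) {
        println!("{} bytes, borrowed: {}", content.len(), matches!(content, Cow::Borrowed(_)));
    }
}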

diffctx/src/edges/similarity/lexical.rs

Lines changed: 136 additions & 102 deletions
@@ -63,74 +63,34 @@ fn clamp_lexical_weight(raw_sim: f64, src_path: Option<&Path>, dst_path: Option<

 pub struct LexicalEdgeBuilder;

-impl LexicalEdgeBuilder {
-    fn compute_doc_frequencies(&self, fragments: &[Fragment]) -> FxHashMap<String, usize> {
-        let mut doc_freq: FxHashMap<String, usize> = FxHashMap::default();
-        for frag in fragments {
-            let profile = profile_from_path(frag.path());
-            let idents = extract_identifier_list(&frag.content, 3);
-            let filtered = filter_idents(&idents, 3, profile);
-            let mut seen: FxHashSet<String> = FxHashSet::default();
-            for ident in filtered {
-                if seen.insert(ident.clone()) {
-                    *doc_freq.entry(ident).or_insert(0) += 1;
-                }
-            }
+/// Maps each unique term to a compact u32 id. Stores each term string exactly once.
+struct TermInterner {
+    by_str: FxHashMap<String, u32>,
+}
+
+impl TermInterner {
+    fn new() -> Self {
+        Self {
+            by_str: FxHashMap::default(),
         }
-        doc_freq
     }

-    fn compute_idf(
-        &self,
-        doc_freq: &FxHashMap<String, usize>,
-        n_docs: usize,
-    ) -> FxHashMap<String, f64> {
-        doc_freq
-            .iter()
-            .map(|(term, &df)| {
-                let idf = ((n_docs as f64 + 1.0) / (df as f64 + 1.0)).ln() + 1.0;
-                (term.clone(), idf)
-            })
-            .collect()
+    fn intern(&mut self, term: String) -> u32 {
+        let next_id = self.by_str.len() as u32;
+        *self.by_str.entry(term).or_insert(next_id)
+    }
+
+    fn len(&self) -> usize {
+        self.by_str.len()
     }
+}

-    fn build_tf_idf_vector(
-        &self,
-        frag: &Fragment,
-        doc_freq: &FxHashMap<String, usize>,
-        idf: &FxHashMap<String, f64>,
-        max_df: usize,
-    ) -> FxHashMap<String, f64> {
+impl LexicalEdgeBuilder {
+    /// Tokenize and filter identifiers for one fragment. Returns the raw filtered identifier list.
+    fn tokens(frag: &Fragment) -> Vec<String> {
         let profile = profile_from_path(frag.path());
         let idents = extract_identifier_list(&frag.content, 3);
-        let filtered = filter_idents(&idents, 3, profile);
-
-        let mut tf: FxHashMap<String, usize> = FxHashMap::default();
-        for ident in filtered {
-            *tf.entry(ident).or_insert(0) += 1;
-        }
-
-        let mut vec: FxHashMap<String, f64> = FxHashMap::default();
-        for (term, &count) in &tf {
-            let df = doc_freq.get(term).copied().unwrap_or(0);
-            if df == 0 || df > max_df {
-                continue;
-            }
-            let term_idf = idf.get(term).copied().unwrap_or(1.0);
-            if term_idf < LEXICAL.min_idf {
-                continue;
-            }
-            vec.insert(term.clone(), count as f64 * term_idf);
-        }
-
-        let norm: f64 = vec.values().map(|v| v * v).sum::<f64>().sqrt();
-        if norm > 0.0 {
-            for v in vec.values_mut() {
-                *v /= norm;
-            }
-        }
-
-        vec
+        filter_idents(&idents, 3, profile)
     }
 }

@@ -140,81 +100,155 @@ impl EdgeBuilder for LexicalEdgeBuilder {
             return FxHashMap::default();
         }

-        let doc_freq = self.compute_doc_frequencies(fragments);
         let n_docs = fragments.len();
         let max_df = (n_docs as f64 * LEXICAL.max_df_ratio).max(1.0) as usize;
-        let idf = self.compute_idf(&doc_freq, n_docs);

-        let tf_idf_vectors: FxHashMap<FragmentId, FxHashMap<String, f64>> = fragments
+        // Pass 1: tokenize each fragment in parallel; flatten to per-fragment Vec<String>.
+        let per_frag_tokens: Vec<Vec<String>> =
+            fragments.par_iter().map(|f| Self::tokens(f)).collect();
+
+        // Pass 2: build the term interner serially, computing document frequency in one go.
+        let mut interner = TermInterner::new();
+        let mut doc_freq: Vec<u32> = Vec::new();
+        let per_frag_term_ids: Vec<Vec<u32>> = per_frag_tokens
+            .into_iter()
+            .map(|tokens| {
+                let mut seen_in_doc: FxHashSet<u32> = FxHashSet::default();
+                let mut ids: Vec<u32> = Vec::with_capacity(tokens.len());
+                for tok in tokens {
+                    let id = interner.intern(tok);
+                    if doc_freq.len() <= id as usize {
+                        doc_freq.resize(id as usize + 1, 0);
+                    }
+                    if seen_in_doc.insert(id) {
+                        doc_freq[id as usize] += 1;
+                    }
+                    ids.push(id);
+                }
+                ids
+            })
+            .collect();
+
+        let n_terms = interner.len();
+        // Interner string-table is no longer needed once doc_freq has been built.
+        drop(interner);
+
+        let n_docs_f = n_docs as f64;
+        let mut idf: Vec<f32> = Vec::with_capacity(n_terms);
+        for &df in &doc_freq {
+            let v = ((n_docs_f + 1.0) / (df as f64 + 1.0)).ln() + 1.0;
+            idf.push(v as f32);
+        }
+
+        // Pass 3: build TF-IDF vectors as sparse Vec<(TermId, f32)>, normalized.
+        let tf_idf: Vec<Vec<(u32, f32)>> = per_frag_term_ids
             .par_iter()
-            .map(|frag| {
-                let vec = self.build_tf_idf_vector(frag, &doc_freq, &idf, max_df);
-                (frag.id.clone(), vec)
+            .map(|term_ids| {
+                let mut tf: FxHashMap<u32, u32> = FxHashMap::default();
+                for &id in term_ids {
+                    *tf.entry(id).or_insert(0) += 1;
+                }
+                let mut vec: Vec<(u32, f32)> = Vec::with_capacity(tf.len());
+                for (&term_id, &count) in &tf {
+                    let df = doc_freq[term_id as usize] as usize;
+                    if df == 0 || df > max_df {
+                        continue;
+                    }
+                    let term_idf = idf[term_id as usize];
+                    if (term_idf as f64) < LEXICAL.min_idf {
+                        continue;
+                    }
+                    vec.push((term_id, count as f32 * term_idf));
+                }
+                let norm: f32 = vec.iter().map(|(_, w)| w * w).sum::<f32>().sqrt();
+                if norm > 0.0 {
+                    for (_, w) in &mut vec {
+                        *w /= norm;
+                    }
+                }
+                vec.sort_unstable_by_key(|&(id, _)| id);
+                vec
             })
             .collect();

-        let mut postings: FxHashMap<String, Vec<(FragmentId, f64)>> = FxHashMap::default();
-        for (frag_id, vec) in &tf_idf_vectors {
-            for (term, &weight) in vec {
-                postings
-                    .entry(term.clone())
-                    .or_default()
-                    .push((frag_id.clone(), weight));
+        drop(per_frag_term_ids);
+        drop(doc_freq);
+        drop(idf);
+
+        // Pass 4: invert into postings — for each term, list of (frag_idx, weight).
+        // Consume tf_idf as we go so it never coexists with the inverted index.
+        let mut postings: Vec<Vec<(u32, f32)>> = vec![Vec::new(); n_terms];
+        for (frag_idx, vec) in tf_idf.into_iter().enumerate() {
+            for (term_id, weight) in vec {
+                postings[term_id as usize].push((frag_idx as u32, weight));
             }
         }

-        let mut dot_products: FxHashMap<(FragmentId, FragmentId), f64> = FxHashMap::default();
-        for (_term, posting_list) in &postings {
-            if posting_list.len() > LEXICAL.max_postings {
+        // Pass 5: O(F²) inner loop over each posting, capped by max_postings.
+        // Drop each posting list as soon as we are done with it.
+        let mut dot_products: FxHashMap<(u32, u32), f32> = FxHashMap::default();
+        for posting_list in postings.iter_mut() {
+            if posting_list.len() > LEXICAL.max_postings || posting_list.len() < 2 {
+                posting_list.clear();
+                posting_list.shrink_to_fit();
                 continue;
             }
             for i in 0..posting_list.len() {
-                let (ref frag_i, weight_i) = posting_list[i];
+                let (frag_i, weight_i) = posting_list[i];
                 for j in (i + 1)..posting_list.len() {
-                    let (ref frag_j, weight_j) = posting_list[j];
-                    let pair = if frag_i.to_string() < frag_j.to_string() {
-                        (frag_i.clone(), frag_j.clone())
+                    let (frag_j, weight_j) = posting_list[j];
+                    let pair = if frag_i < frag_j {
+                        (frag_i, frag_j)
                     } else {
-                        (frag_j.clone(), frag_i.clone())
+                        (frag_j, frag_i)
                    };
                    *dot_products.entry(pair).or_insert(0.0) += weight_i * weight_j;
                }
            }
+            posting_list.clear();
+            posting_list.shrink_to_fit();
         }
+        drop(postings);

-        let id_to_path: FxHashMap<FragmentId, &str> =
-            fragments.iter().map(|f| (f.id.clone(), f.path())).collect();
-
-        let mut neighbors_by_node: FxHashMap<FragmentId, Vec<(f64, FragmentId)>> =
-            FxHashMap::default();
+        // Pass 6: turn pairwise similarities into per-node top-k candidate edges.
+        let frag_paths: Vec<&str> = fragments.iter().map(|f| f.path()).collect();
+        let mut neighbors_by_node: FxHashMap<u32, Vec<(f32, u32)>> = FxHashMap::default();

-        for ((src, dst), sim) in &dot_products {
-            if *sim < LEXICAL.min_similarity {
+        let min_sim = LEXICAL.min_similarity as f32;
+        let backward_factor = LEXICAL.backward_factor as f32;
+        for ((src_idx, dst_idx), sim) in &dot_products {
+            if *sim < min_sim {
                 continue;
             }
-            let src_path = id_to_path.get(src).map(|s| Path::new(*s));
-            let dst_path = id_to_path.get(dst).map(|s| Path::new(*s));
-            let fwd = clamp_lexical_weight(*sim, src_path, dst_path);
-            let bwd = clamp_lexical_weight(*sim, dst_path, src_path) * LEXICAL.backward_factor;
+            let src_path = Path::new(frag_paths[*src_idx as usize]);
+            let dst_path = Path::new(frag_paths[*dst_idx as usize]);
+            let fwd = clamp_lexical_weight(*sim as f64, Some(src_path), Some(dst_path)) as f32;
+            let bwd = clamp_lexical_weight(*sim as f64, Some(dst_path), Some(src_path)) as f32
+                * backward_factor;
             neighbors_by_node
-                .entry(src.clone())
+                .entry(*src_idx)
                 .or_default()
-                .push((fwd, dst.clone()));
+                .push((fwd, *dst_idx));
             neighbors_by_node
-                .entry(dst.clone())
+                .entry(*dst_idx)
                 .or_default()
-                .push((bwd, src.clone()));
+                .push((bwd, *src_idx));
         }

+        let frag_ids: Vec<&FragmentId> = fragments.iter().map(|f| &f.id).collect();
         let mut edges: EdgeDict = FxHashMap::default();
-        for (_node, mut candidates) in neighbors_by_node {
+        for (node_idx, mut candidates) in neighbors_by_node {
             candidates.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
             candidates.truncate(LEXICAL.top_k_neighbors);
-            for (weight, neighbor) in candidates {
-                let key = (_node.clone(), neighbor);
+            for (weight, neighbor_idx) in candidates {
+                let key = (
+                    frag_ids[node_idx as usize].clone(),
+                    frag_ids[neighbor_idx as usize].clone(),
+                );
                 let existing = edges.get(&key).copied().unwrap_or(0.0);
-                if weight > existing {
-                    edges.insert(key, weight);
+                let weight_f64 = weight as f64;
+                if weight_f64 > existing {
+                    edges.insert(key, weight_f64);
                 }
             }
         }

diffctx/src/filtering.rs

Lines changed: 5 additions & 10 deletions
@@ -104,12 +104,10 @@ fn classify_semantic_edges(
         let (changed_frag, other_frag) = if src_changed { (src, dst) } else { (dst, src) };

         let fwd_w = graph
-            .neighbors(&changed_frag)
-            .and_then(|nbrs| nbrs.get(&other_frag).copied())
+            .forward_edge_weight(changed_frag, other_frag)
             .unwrap_or(0.0);
         let rev_w = graph
-            .neighbors(&other_frag)
-            .and_then(|nbrs| nbrs.get(&changed_frag).copied())
+            .forward_edge_weight(other_frag, changed_frag)
             .unwrap_or(0.0);

         if rev_w > fwd_w {

@@ -225,7 +223,7 @@ fn find_config_generic_code_files(
 }

 pub fn filter_unrelated_fragments(
-    fragments: Vec<Fragment>,
+    fragments: &[Fragment],
     core_ids: &FxHashSet<FragmentId>,
     graph: &Graph,
 ) -> Vec<Fragment> {

@@ -240,13 +238,10 @@ pub fn filter_unrelated_fragments(
         paths_to_remove.remove(p);
     }

-    if paths_to_remove.is_empty() {
-        return fragments;
-    }
-
     fragments
-        .into_iter()
+        .iter()
         .filter(|f| !paths_to_remove.contains(&f.id.path))
+        .cloned()
         .collect()
 }
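The FragmentEntry/PyFragment change from the commit message lives in files outside this excerpt, so the sketch below illustrates the Option<Arc<str>> sharing it describes; the struct and function names here are illustrative. It is also presumably why the Cargo.toml hunk enables serde's "rc" feature, which provides the Serialize/Deserialize impls for Arc-wrapped fields.

use std::sync::Arc;

// Illustrative stand-in for a fragment entry whose body is shared by refcount.
struct FragmentEntry {
    content: Option<Arc<str>>, // one allocation per fragment body
}

fn render(selected: &[FragmentEntry]) -> Vec<Arc<str>> {
    selected
        .iter()
        // Arc::clone bumps a refcount; the old code copied the whole body with to_string().
        .filter_map(|e| e.content.clone())
        .collect()
}

fn main() {
    let body: Arc<str> = Arc::from("fn main() {}");
    let entries = vec![FragmentEntry { content: Some(body.clone()) }];
    let rendered = render(&entries);
    // Both handles point at the same allocation; no text was copied.
    assert!(Arc::ptr_eq(&rendered[0], &body));
    println!("shared {} bytes without copying", rendered[0].len());
}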
