Skip to content

Commit 4ed9723

Browse files
committed
feat(review): bias semantic retrieval with graph-ranked files
Use symbol-graph related files to prioritize semantic RAG candidates before global similarity ranking, surface the graph-ranked hint in semantic context, add coverage for the ranking behavior, and mark TODO.md item #37 complete.
1 parent 1276d2a commit 4ed9723

File tree

3 files changed

+165
-6
lines changed

3 files changed

+165
-6
lines changed

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
6767
34. [ ] Add "similar implementation" lookup so repeated patterns and divergences are explicit.
6868
35. [ ] Add cross-file blast-radius summaries to findings when a change affects many callers.
6969
36. [ ] Add graph freshness/version metadata so reviews know whether they are using stale repository intelligence.
70-
37. [ ] Add graph-backed ranking of related files before semantic RAG retrieval.
70+
37. [x] Add graph-backed ranking of related files before semantic RAG retrieval.
7171
38. [ ] Add graph query traces to `dag_traces` or review artifacts for explainability and debugging.
7272
39. [x] Add graph-aware eval fixtures that require multi-hop code understanding to pass.
7373
40. [ ] Split `src/core/symbol_graph.rs` into construction, persistence, traversal, and ranking modules as it grows.

src/core/semantic.rs

Lines changed: 111 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -247,25 +247,38 @@ pub async fn semantic_context_for_diff(
247247
embedding_adapter: Option<&dyn LLMAdapter>,
248248
limit: usize,
249249
min_similarity: f32,
250+
preferred_files: &[PathBuf],
250251
) -> Vec<LLMContextChunk> {
251252
let query_texts = build_query_texts(diff, file_content);
252253
if query_texts.is_empty() {
253254
return Vec::new();
254255
}
255256

256257
let query_embeddings = embed_texts_with_fallback(embedding_adapter, &query_texts).await;
257-
let matches =
258-
find_related_chunks_for_diff(index, &query_embeddings, diff, limit, min_similarity);
258+
let matches = find_related_chunks_for_diff(
259+
index,
260+
&query_embeddings,
261+
diff,
262+
limit,
263+
min_similarity,
264+
preferred_files,
265+
);
266+
let preferred_file_ranks = build_preferred_file_ranks(preferred_files);
259267

260268
let mut seen = HashSet::new();
261269
let mut chunks = Vec::new();
262270
for semantic_match in matches {
263271
if !seen.insert(semantic_match.chunk.key.clone()) {
264272
continue;
265273
}
274+
let ranking_note = preferred_file_ranks
275+
.get(&semantic_match.chunk.file_path)
276+
.map(|rank| format!(", graph-ranked file #{}", rank + 1))
277+
.unwrap_or_default();
266278
let content = format!(
267-
"Semantic match (similarity {:.2})\nSymbol: {}\nSummary: {}\nCode:\n{}",
279+
"Semantic match (similarity {:.2}{})\nSymbol: {}\nSummary: {}\nCode:\n{}",
268280
semantic_match.similarity,
281+
ranking_note,
269282
semantic_match.chunk.symbol_name,
270283
semantic_match.chunk.summary,
271284
semantic_match.chunk.code_excerpt,
@@ -323,8 +336,10 @@ fn find_related_chunks_for_diff(
323336
diff: &UnifiedDiff,
324337
limit: usize,
325338
min_similarity: f32,
339+
preferred_files: &[PathBuf],
326340
) -> Vec<SemanticMatch> {
327341
let changed_ranges = changed_line_ranges(diff);
342+
let preferred_file_ranks = build_preferred_file_ranks(preferred_files);
328343
let mut best_matches: HashMap<String, SemanticMatch> = HashMap::new();
329344

330345
for query_embedding in query_embeddings {
@@ -351,11 +366,34 @@ fn find_related_chunks_for_diff(
351366
}
352367

353368
let mut matches = best_matches.into_values().collect::<Vec<_>>();
354-
matches.sort_by(|a, b| b.similarity.total_cmp(&a.similarity));
369+
matches.sort_by(|a, b| {
370+
let a_rank = preferred_file_ranks
371+
.get(&a.chunk.file_path)
372+
.copied()
373+
.unwrap_or(usize::MAX);
374+
let b_rank = preferred_file_ranks
375+
.get(&b.chunk.file_path)
376+
.copied()
377+
.unwrap_or(usize::MAX);
378+
a_rank
379+
.cmp(&b_rank)
380+
.then_with(|| b.similarity.total_cmp(&a.similarity))
381+
.then_with(|| a.chunk.key.cmp(&b.chunk.key))
382+
});
355383
matches.truncate(limit.max(1));
356384
matches
357385
}
358386

387+
/// Maps each preferred file path to its zero-based position in `preferred_files`.
///
/// A lower rank means the file appeared earlier in the slice. When the same
/// path occurs more than once, the rank of its first occurrence is kept, so a
/// later duplicate can never demote a file. An empty slice yields an empty map.
fn build_preferred_file_ranks(preferred_files: &[PathBuf]) -> HashMap<PathBuf, usize> {
    // At most one entry per input path, so the slice length is an upper bound.
    let mut preferred_file_ranks = HashMap::with_capacity(preferred_files.len());
    for (rank, file_path) in preferred_files.iter().enumerate() {
        // `or_insert` leaves an existing entry untouched: first occurrence wins.
        preferred_file_ranks
            .entry(file_path.clone())
            .or_insert(rank);
    }
    preferred_file_ranks
}
396+
359397
pub fn find_similar_feedback_examples(
360398
store: &SemanticFeedbackStore,
361399
query_embedding: &[f32],
@@ -622,8 +660,76 @@ mod tests {
622660
}],
623661
};
624662

625-
let chunks = semantic_context_for_diff(&index, &diff, None, None, 3, 0.1).await;
663+
let chunks = semantic_context_for_diff(&index, &diff, None, None, 3, 0.1, &[]).await;
626664
assert_eq!(chunks.len(), 1);
627665
assert!(chunks[0].content.contains("Semantic match"));
628666
}
667+
668+
/// Checks that a chunk from a graph-preferred file is ordered ahead of a chunk
/// from a non-preferred file.
///
/// The index holds two chunks: one in `src/graph.rs` (embedding `[0.8, 0.6]`)
/// and one in `src/other.rs` (embedding `[1.0, 0.0]`, identical to the query
/// vector, so it presumably scores the higher similarity). Only `src/graph.rs`
/// is passed as a preferred file, and the test expects it to come first.
#[test]
fn find_related_chunks_for_diff_prioritizes_graph_ranked_files() {
    let mut index = SemanticIndex::default();
    index.entries.insert(
        "src/graph.rs:helper:1:5".to_string(),
        SemanticChunk {
            key: "src/graph.rs:helper:1:5".to_string(),
            file_path: PathBuf::from("src/graph.rs"),
            symbol_name: "graph_helper".to_string(),
            line_range: (1, 5),
            summary: "Graph-ranked helper".to_string(),
            embedding_text: "graph-ranked helper".to_string(),
            code_excerpt: "fn graph_helper() {}".to_string(),
            embedding: vec![0.8, 0.6],
            content_hash: "graph".to_string(),
        },
    );
    index.entries.insert(
        "src/other.rs:helper:1:5".to_string(),
        SemanticChunk {
            key: "src/other.rs:helper:1:5".to_string(),
            file_path: PathBuf::from("src/other.rs"),
            symbol_name: "other_helper".to_string(),
            line_range: (1, 5),
            summary: "Higher-similarity helper".to_string(),
            embedding_text: "higher-similarity helper".to_string(),
            code_excerpt: "fn other_helper() {}".to_string(),
            embedding: vec![1.0, 0.0],
            content_hash: "other".to_string(),
        },
    );

    // The diff touches a third file, so neither indexed chunk belongs to the
    // file under review.
    let diff = UnifiedDiff {
        old_content: None,
        new_content: None,
        file_path: PathBuf::from("src/current.rs"),
        is_new: false,
        is_deleted: false,
        is_binary: false,
        hunks: vec![DiffHunk {
            old_start: 1,
            old_lines: 0,
            new_start: 1,
            new_lines: 1,
            context: String::new(),
            changes: vec![DiffLine {
                old_line_no: None,
                new_line_no: Some(1),
                change_type: ChangeType::Added,
                content: "graph-aware semantic ranking".to_string(),
            }],
        }],
    };

    let matches = find_related_chunks_for_diff(
        &index,
        &[vec![1.0, 0.0]],
        &diff,
        2,
        0.1,
        &[PathBuf::from("src/graph.rs")],
    );

    // Both chunks are returned; the preferred file is ordered first.
    assert_eq!(matches.len(), 2);
    assert_eq!(matches[0].chunk.file_path, PathBuf::from("src/graph.rs"));
    assert_eq!(matches[1].chunk.file_path, PathBuf::from("src/other.rs"));
}
629735
}

src/review/pipeline/file_context/sources/supplemental.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1+
use std::collections::HashSet;
2+
use std::path::PathBuf;
3+
14
use anyhow::Result;
25

36
use crate::config;
47
use crate::core;
58

9+
use super::super::super::context::extract_symbols_from_diff;
610
use super::super::super::services::PipelineServices;
711
use super::super::super::session::ReviewSession;
812

@@ -15,6 +19,7 @@ pub(in super::super) async fn add_semantic_context(
1519
let Some(index) = session.semantic_index.as_ref() else {
1620
return;
1721
};
22+
let preferred_files = graph_ranked_semantic_files(services, session, diff);
1823

1924
let semantic_chunks = core::semantic_context_for_diff(
2025
index,
@@ -26,11 +31,59 @@ pub(in super::super) async fn add_semantic_context(
2631
services.embedding_adapter.as_deref(),
2732
services.config.semantic_rag_top_k,
2833
services.config.semantic_rag_min_similarity,
34+
&preferred_files,
2935
)
3036
.await;
3137
context_chunks.extend(semantic_chunks);
3238
}
3339

40+
fn graph_ranked_semantic_files(
41+
services: &PipelineServices,
42+
session: &ReviewSession,
43+
diff: &core::UnifiedDiff,
44+
) -> Vec<PathBuf> {
45+
let Some(index) = session.symbol_index.as_ref() else {
46+
return Vec::new();
47+
};
48+
49+
let symbols = extract_symbols_from_diff(diff);
50+
if symbols.is_empty() {
51+
return Vec::new();
52+
}
53+
54+
let retriever = core::SymbolContextRetriever::new(
55+
index,
56+
core::SymbolRetrievalPolicy::new(
57+
services.config.symbol_index_max_locations,
58+
services.config.symbol_index_graph_hops,
59+
services.config.symbol_index_graph_max_files,
60+
),
61+
);
62+
let related_locations = retriever.related_symbol_locations(&diff.file_path, &symbols);
63+
64+
let mut preferred_files = Vec::new();
65+
let mut seen = HashSet::new();
66+
67+
for location in related_locations.definition_locations {
68+
if location.file_path == diff.file_path || !seen.insert(location.file_path.clone()) {
69+
continue;
70+
}
71+
preferred_files.push(location.file_path);
72+
}
73+
74+
let mut reference_files = related_locations
75+
.reference_locations
76+
.into_iter()
77+
.map(|location| location.file_path)
78+
.filter(|file_path| file_path != &diff.file_path)
79+
.filter(|file_path| seen.insert(file_path.clone()))
80+
.collect::<Vec<_>>();
81+
reference_files.sort();
82+
preferred_files.extend(reference_files);
83+
84+
preferred_files
85+
}
86+
3487
pub(in super::super) async fn add_path_context(
3588
services: &PipelineServices,
3689
diff: &core::UnifiedDiff,

0 commit comments

Comments
 (0)