Skip to content

Commit 0a035b2

Browse files
committed
refactor: split core semantic embedding helpers
Isolate embedding metadata compatibility, fallback vector generation, and similarity utilities so the remaining semantic refresh and retrieval logic can be decomposed without dragging the embedding layer along.

Made-with: Cursor
1 parent 23dfb0c commit 0a035b2

File tree

3 files changed

+156
-139
lines changed

3 files changed

+156
-139
lines changed

TODO.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
## Immediate Queue
1212

1313
- [ ] `src/core/semantic.rs`
14-
- Split embedding metadata compatibility and adapter/fallback embedding generation.
1514
- Split source-file discovery and excerpt/query builders from index refresh bookkeeping.
1615
- Split semantic diff retrieval and feedback-example matching from feedback-store maintenance.
1716
- [ ] `src/core/symbol_index.rs`
@@ -25,7 +24,7 @@
2524
- [ ] `src/core/semantic.rs`
2625
- Split semantic chunk hashing/key generation from summary/excerpt assembly.
2726
- Split changed-range filtering and per-query match scoring from context chunk rendering.
28-
- Split feedback embedding-text/fingerprint helpers from feedback-store reconciliation.
27+
- Split feedback fingerprint helpers from feedback-store reconciliation.
2928
- [ ] `src/config.rs`
3029
- Split defaults/model-role conversion from load/deserialize paths.
3130
- Split env/path resolution from validation/migration logic.

src/core/semantic.rs

Lines changed: 11 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ use crate::core::diff_parser::{ChangeType, UnifiedDiff};
1111
use crate::core::function_chunker::chunk_diff_by_functions;
1212
use crate::core::ContextProvenance;
1313

14+
#[path = "semantic/embedding.rs"]
15+
mod embedding;
1416
#[path = "semantic/persistence.rs"]
1517
mod persistence;
1618
#[path = "semantic/types.rs"]
@@ -23,8 +25,15 @@ const SUPPORTED_CODE_EXTENSIONS: &[&str] = &[
2325
"cc", "cpp", "cxx", "hpp", "swift", "scala",
2426
];
2527

26-
use types::default_embedding_metadata;
27-
28+
#[cfg(test)]
29+
use embedding::local_hash_embedding;
30+
pub use embedding::{
31+
align_semantic_feedback_store, build_feedback_embedding_text, embed_texts_with_fallback,
32+
};
33+
use embedding::{
34+
cosine_similarity, embedding_metadata_compatible, embedding_metadata_for_adapter,
35+
merge_embedding_metadata,
36+
};
2837
pub use persistence::{
2938
default_index_path, default_semantic_feedback_path, load_semantic_feedback_store,
3039
load_semantic_index, save_semantic_feedback_store, save_semantic_index,
@@ -56,41 +65,6 @@ impl SemanticFeedbackStore {
5665
}
5766
}
5867

59-
pub fn align_semantic_feedback_store(
60-
store: &mut SemanticFeedbackStore,
61-
embedding_adapter: Option<&dyn LLMAdapter>,
62-
) {
63-
let expected = embedding_metadata_for_adapter(embedding_adapter);
64-
if !embedding_metadata_compatible(&store.embedding, &expected) {
65-
store.examples.clear();
66-
}
67-
store.embedding = merge_embedding_metadata(&store.embedding, &expected);
68-
}
69-
70-
pub async fn embed_texts_with_fallback(
71-
adapter: Option<&dyn LLMAdapter>,
72-
texts: &[String],
73-
) -> Vec<Vec<f32>> {
74-
if texts.is_empty() {
75-
return Vec::new();
76-
}
77-
78-
if let Some(adapter) = adapter {
79-
if adapter.supports_embeddings() {
80-
if let Ok(vectors) = adapter.embed(texts).await {
81-
if vectors.len() == texts.len() && vectors.iter().all(|vector| !vector.is_empty()) {
82-
return vectors;
83-
}
84-
}
85-
}
86-
}
87-
88-
texts
89-
.iter()
90-
.map(|text| local_hash_embedding(text))
91-
.collect()
92-
}
93-
9468
pub fn discover_source_files<F>(
9569
repo_root: &Path,
9670
should_exclude: F,
@@ -451,58 +425,6 @@ fn ranges_overlap(left: (usize, usize), right: (usize, usize)) -> bool {
451425
left.0 <= right.1 && right.0 <= left.1
452426
}
453427

454-
pub fn build_feedback_embedding_text(content: &str, category: &str) -> String {
455-
format!("Category: {}\nComment: {}", category, content)
456-
}
457-
458-
pub fn local_hash_embedding(text: &str) -> Vec<f32> {
459-
let mut vector = vec![0.0; FALLBACK_EMBEDDING_DIMENSIONS];
460-
let mut seen = 0usize;
461-
462-
for token in tokenize(text) {
463-
let hash = Sha256::digest(token.as_bytes());
464-
let idx = ((hash[0] as usize) << 8 | hash[1] as usize) % FALLBACK_EMBEDDING_DIMENSIONS;
465-
let weight = 1.0 + (hash[2] as f32 / 255.0);
466-
if hash[3] % 2 == 0 {
467-
vector[idx] += weight;
468-
} else {
469-
vector[idx] -= weight;
470-
}
471-
seen += 1;
472-
}
473-
474-
if seen == 0 {
475-
return vector;
476-
}
477-
478-
let norm = vector.iter().map(|value| value * value).sum::<f32>().sqrt();
479-
if norm > 0.0 {
480-
for value in &mut vector {
481-
*value /= norm;
482-
}
483-
}
484-
485-
vector
486-
}
487-
488-
pub fn cosine_similarity(left: &[f32], right: &[f32]) -> f32 {
489-
if left.is_empty() || right.is_empty() || left.len() != right.len() {
490-
return 0.0;
491-
}
492-
let mut dot = 0.0;
493-
let mut left_norm = 0.0;
494-
let mut right_norm = 0.0;
495-
for idx in 0..left.len() {
496-
dot += left[idx] * right[idx];
497-
left_norm += left[idx] * left[idx];
498-
right_norm += right[idx] * right[idx];
499-
}
500-
if left_norm == 0.0 || right_norm == 0.0 {
501-
return 0.0;
502-
}
503-
(dot / (left_norm.sqrt() * right_norm.sqrt())).clamp(-1.0, 1.0)
504-
}
505-
506428
fn build_query_texts(diff: &UnifiedDiff, file_content: Option<&str>) -> Vec<String> {
507429
let chunks = chunk_diff_by_functions(diff, file_content);
508430
let mut queries = Vec::new();
@@ -586,47 +508,6 @@ fn feedback_example_fingerprint(
586508
))
587509
}
588510

589-
fn embedding_metadata_for_adapter(adapter: Option<&dyn LLMAdapter>) -> SemanticEmbeddingMetadata {
590-
match adapter {
591-
Some(adapter) if adapter.supports_embeddings() => SemanticEmbeddingMetadata {
592-
strategy: "native".to_string(),
593-
model: adapter.model_name().to_string(),
594-
dimensions: 0,
595-
},
596-
_ => default_embedding_metadata(),
597-
}
598-
}
599-
600-
fn embedding_metadata_compatible(
601-
existing: &SemanticEmbeddingMetadata,
602-
expected: &SemanticEmbeddingMetadata,
603-
) -> bool {
604-
existing.strategy == expected.strategy
605-
&& existing.model == expected.model
606-
&& (existing.dimensions == 0
607-
|| expected.dimensions == 0
608-
|| existing.dimensions == expected.dimensions)
609-
}
610-
611-
fn merge_embedding_metadata(
612-
existing: &SemanticEmbeddingMetadata,
613-
expected: &SemanticEmbeddingMetadata,
614-
) -> SemanticEmbeddingMetadata {
615-
if !embedding_metadata_compatible(existing, expected) {
616-
return expected.clone();
617-
}
618-
619-
SemanticEmbeddingMetadata {
620-
strategy: expected.strategy.clone(),
621-
model: expected.model.clone(),
622-
dimensions: if expected.dimensions > 0 {
623-
expected.dimensions
624-
} else {
625-
existing.dimensions
626-
},
627-
}
628-
}
629-
630511
fn remove_entries_for_file(index: &mut SemanticIndex, file_path: &Path) {
631512
index
632513
.entries
@@ -652,13 +533,6 @@ fn hash_text(content: &str) -> String {
652533
format!("{:x}", digest)
653534
}
654535

655-
fn tokenize(text: &str) -> Vec<String> {
656-
text.split(|ch: char| !ch.is_alphanumeric())
657-
.filter(|token| !token.is_empty())
658-
.map(|token| token.to_ascii_lowercase())
659-
.collect()
660-
}
661-
662536
fn is_code_file(path: &Path) -> bool {
663537
path.extension()
664538
.and_then(|extension| extension.to_str())

src/core/semantic/embedding.rs

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
use sha2::{Digest, Sha256};
2+
3+
use crate::adapters::llm::LLMAdapter;
4+
5+
use super::types::default_embedding_metadata;
6+
use super::{SemanticEmbeddingMetadata, SemanticFeedbackStore};
7+
8+
pub fn align_semantic_feedback_store(
9+
store: &mut SemanticFeedbackStore,
10+
embedding_adapter: Option<&dyn LLMAdapter>,
11+
) {
12+
let expected = embedding_metadata_for_adapter(embedding_adapter);
13+
if !embedding_metadata_compatible(&store.embedding, &expected) {
14+
store.examples.clear();
15+
}
16+
store.embedding = merge_embedding_metadata(&store.embedding, &expected);
17+
}
18+
19+
pub async fn embed_texts_with_fallback(
20+
adapter: Option<&dyn LLMAdapter>,
21+
texts: &[String],
22+
) -> Vec<Vec<f32>> {
23+
if texts.is_empty() {
24+
return Vec::new();
25+
}
26+
27+
if let Some(adapter) = adapter {
28+
if adapter.supports_embeddings() {
29+
if let Ok(vectors) = adapter.embed(texts).await {
30+
if vectors.len() == texts.len() && vectors.iter().all(|vector| !vector.is_empty()) {
31+
return vectors;
32+
}
33+
}
34+
}
35+
}
36+
37+
texts
38+
.iter()
39+
.map(|text| local_hash_embedding(text))
40+
.collect()
41+
}
42+
43+
/// Build the canonical text embedded for a feedback example: the category
/// on one line, the comment on the next. The exact byte layout matters —
/// stored vectors were produced from this format.
pub fn build_feedback_embedding_text(content: &str, category: &str) -> String {
    let mut text =
        String::with_capacity("Category: \nComment: ".len() + category.len() + content.len());
    text.push_str("Category: ");
    text.push_str(category);
    text.push_str("\nComment: ");
    text.push_str(content);
    text
}
46+
47+
pub(super) fn local_hash_embedding(text: &str) -> Vec<f32> {
48+
let mut vector = vec![0.0; super::FALLBACK_EMBEDDING_DIMENSIONS];
49+
let mut seen = 0usize;
50+
51+
for token in tokenize(text) {
52+
let hash = Sha256::digest(token.as_bytes());
53+
let idx =
54+
((hash[0] as usize) << 8 | hash[1] as usize) % super::FALLBACK_EMBEDDING_DIMENSIONS;
55+
let weight = 1.0 + (hash[2] as f32 / 255.0);
56+
if hash[3] % 2 == 0 {
57+
vector[idx] += weight;
58+
} else {
59+
vector[idx] -= weight;
60+
}
61+
seen += 1;
62+
}
63+
64+
if seen == 0 {
65+
return vector;
66+
}
67+
68+
let norm = vector.iter().map(|value| value * value).sum::<f32>().sqrt();
69+
if norm > 0.0 {
70+
for value in &mut vector {
71+
*value /= norm;
72+
}
73+
}
74+
75+
vector
76+
}
77+
78+
pub(super) fn cosine_similarity(left: &[f32], right: &[f32]) -> f32 {
79+
if left.is_empty() || right.is_empty() || left.len() != right.len() {
80+
return 0.0;
81+
}
82+
let mut dot = 0.0;
83+
let mut left_norm = 0.0;
84+
let mut right_norm = 0.0;
85+
for idx in 0..left.len() {
86+
dot += left[idx] * right[idx];
87+
left_norm += left[idx] * left[idx];
88+
right_norm += right[idx] * right[idx];
89+
}
90+
if left_norm == 0.0 || right_norm == 0.0 {
91+
return 0.0;
92+
}
93+
(dot / (left_norm.sqrt() * right_norm.sqrt())).clamp(-1.0, 1.0)
94+
}
95+
96+
pub(super) fn embedding_metadata_for_adapter(
97+
adapter: Option<&dyn LLMAdapter>,
98+
) -> SemanticEmbeddingMetadata {
99+
match adapter {
100+
Some(adapter) if adapter.supports_embeddings() => SemanticEmbeddingMetadata {
101+
strategy: "native".to_string(),
102+
model: adapter.model_name().to_string(),
103+
dimensions: 0,
104+
},
105+
_ => default_embedding_metadata(),
106+
}
107+
}
108+
109+
/// Whether two embedding schemes can share a vector space: strategy and
/// model must match exactly, while a dimension count of 0 on either side
/// acts as a wildcard (dimensions not yet observed).
pub(super) fn embedding_metadata_compatible(
    existing: &SemanticEmbeddingMetadata,
    expected: &SemanticEmbeddingMetadata,
) -> bool {
    if existing.strategy != expected.strategy || existing.model != expected.model {
        return false;
    }
    existing.dimensions == 0
        || expected.dimensions == 0
        || existing.dimensions == expected.dimensions
}
119+
120+
pub(super) fn merge_embedding_metadata(
121+
existing: &SemanticEmbeddingMetadata,
122+
expected: &SemanticEmbeddingMetadata,
123+
) -> SemanticEmbeddingMetadata {
124+
if !embedding_metadata_compatible(existing, expected) {
125+
return expected.clone();
126+
}
127+
128+
SemanticEmbeddingMetadata {
129+
strategy: expected.strategy.clone(),
130+
model: expected.model.clone(),
131+
dimensions: if expected.dimensions > 0 {
132+
expected.dimensions
133+
} else {
134+
existing.dimensions
135+
},
136+
}
137+
}
138+
139+
/// Split text into lowercase alphanumeric tokens; every non-alphanumeric
/// character (including '_') is a separator, and empty fragments are dropped.
fn tokenize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for fragment in text.split(|ch: char| !ch.is_alphanumeric()) {
        if !fragment.is_empty() {
            tokens.push(fragment.to_ascii_lowercase());
        }
    }
    tokens
}

0 commit comments

Comments
 (0)