@@ -11,6 +11,8 @@ use crate::core::diff_parser::{ChangeType, UnifiedDiff};
1111use crate :: core:: function_chunker:: chunk_diff_by_functions;
1212use crate :: core:: ContextProvenance ;
1313
14+ #[ path = "semantic/embedding.rs" ]
15+ mod embedding;
1416#[ path = "semantic/persistence.rs" ]
1517mod persistence;
1618#[ path = "semantic/types.rs" ]
@@ -23,8 +25,15 @@ const SUPPORTED_CODE_EXTENSIONS: &[&str] = &[
2325 "cc" , "cpp" , "cxx" , "hpp" , "swift" , "scala" ,
2426] ;
2527
26- use types:: default_embedding_metadata;
27-
28+ #[ cfg( test) ]
29+ use embedding:: local_hash_embedding;
30+ pub use embedding:: {
31+ align_semantic_feedback_store, build_feedback_embedding_text, embed_texts_with_fallback,
32+ } ;
33+ use embedding:: {
34+ cosine_similarity, embedding_metadata_compatible, embedding_metadata_for_adapter,
35+ merge_embedding_metadata,
36+ } ;
2837pub use persistence:: {
2938 default_index_path, default_semantic_feedback_path, load_semantic_feedback_store,
3039 load_semantic_index, save_semantic_feedback_store, save_semantic_index,
@@ -56,41 +65,6 @@ impl SemanticFeedbackStore {
5665 }
5766}
5867
59- pub fn align_semantic_feedback_store (
60- store : & mut SemanticFeedbackStore ,
61- embedding_adapter : Option < & dyn LLMAdapter > ,
62- ) {
63- let expected = embedding_metadata_for_adapter ( embedding_adapter) ;
64- if !embedding_metadata_compatible ( & store. embedding , & expected) {
65- store. examples . clear ( ) ;
66- }
67- store. embedding = merge_embedding_metadata ( & store. embedding , & expected) ;
68- }
69-
70- pub async fn embed_texts_with_fallback (
71- adapter : Option < & dyn LLMAdapter > ,
72- texts : & [ String ] ,
73- ) -> Vec < Vec < f32 > > {
74- if texts. is_empty ( ) {
75- return Vec :: new ( ) ;
76- }
77-
78- if let Some ( adapter) = adapter {
79- if adapter. supports_embeddings ( ) {
80- if let Ok ( vectors) = adapter. embed ( texts) . await {
81- if vectors. len ( ) == texts. len ( ) && vectors. iter ( ) . all ( |vector| !vector. is_empty ( ) ) {
82- return vectors;
83- }
84- }
85- }
86- }
87-
88- texts
89- . iter ( )
90- . map ( |text| local_hash_embedding ( text) )
91- . collect ( )
92- }
93-
9468pub fn discover_source_files < F > (
9569 repo_root : & Path ,
9670 should_exclude : F ,
@@ -451,58 +425,6 @@ fn ranges_overlap(left: (usize, usize), right: (usize, usize)) -> bool {
451425 left. 0 <= right. 1 && right. 0 <= left. 1
452426}
453427
/// Builds the text that is embedded for a feedback example.
///
/// The result is the category line followed by the comment line, i.e.
/// `Category: <category>` then a newline then ` Comment: <content>`.
pub fn build_feedback_embedding_text(content: &str, category: &str) -> String {
    ["Category: ", category, "\n Comment: ", content].concat()
}
457-
458- pub fn local_hash_embedding ( text : & str ) -> Vec < f32 > {
459- let mut vector = vec ! [ 0.0 ; FALLBACK_EMBEDDING_DIMENSIONS ] ;
460- let mut seen = 0usize ;
461-
462- for token in tokenize ( text) {
463- let hash = Sha256 :: digest ( token. as_bytes ( ) ) ;
464- let idx = ( ( hash[ 0 ] as usize ) << 8 | hash[ 1 ] as usize ) % FALLBACK_EMBEDDING_DIMENSIONS ;
465- let weight = 1.0 + ( hash[ 2 ] as f32 / 255.0 ) ;
466- if hash[ 3 ] % 2 == 0 {
467- vector[ idx] += weight;
468- } else {
469- vector[ idx] -= weight;
470- }
471- seen += 1 ;
472- }
473-
474- if seen == 0 {
475- return vector;
476- }
477-
478- let norm = vector. iter ( ) . map ( |value| value * value) . sum :: < f32 > ( ) . sqrt ( ) ;
479- if norm > 0.0 {
480- for value in & mut vector {
481- * value /= norm;
482- }
483- }
484-
485- vector
486- }
487-
/// Returns the cosine similarity of `left` and `right`, clamped to `[-1.0, 1.0]`.
///
/// Yields `0.0` when either slice is empty, when the slices differ in length,
/// or when either slice has a zero sum of squares.
pub fn cosine_similarity(left: &[f32], right: &[f32]) -> f32 {
    if left.len() != right.len() || left.is_empty() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, left_sq, right_sq) = left.iter().zip(right).fold(
        (0.0_f32, 0.0_f32, 0.0_f32),
        |(dot, l_sq, r_sq), (&l, &r)| (dot + l * r, l_sq + l * l, r_sq + r * r),
    );

    if left_sq == 0.0 || right_sq == 0.0 {
        return 0.0;
    }

    let cosine = dot / (left_sq.sqrt() * right_sq.sqrt());
    cosine.clamp(-1.0, 1.0)
}
505-
506428fn build_query_texts ( diff : & UnifiedDiff , file_content : Option < & str > ) -> Vec < String > {
507429 let chunks = chunk_diff_by_functions ( diff, file_content) ;
508430 let mut queries = Vec :: new ( ) ;
@@ -586,47 +508,6 @@ fn feedback_example_fingerprint(
586508 ) )
587509}
588510
589- fn embedding_metadata_for_adapter ( adapter : Option < & dyn LLMAdapter > ) -> SemanticEmbeddingMetadata {
590- match adapter {
591- Some ( adapter) if adapter. supports_embeddings ( ) => SemanticEmbeddingMetadata {
592- strategy : "native" . to_string ( ) ,
593- model : adapter. model_name ( ) . to_string ( ) ,
594- dimensions : 0 ,
595- } ,
596- _ => default_embedding_metadata ( ) ,
597- }
598- }
599-
600- fn embedding_metadata_compatible (
601- existing : & SemanticEmbeddingMetadata ,
602- expected : & SemanticEmbeddingMetadata ,
603- ) -> bool {
604- existing. strategy == expected. strategy
605- && existing. model == expected. model
606- && ( existing. dimensions == 0
607- || expected. dimensions == 0
608- || existing. dimensions == expected. dimensions )
609- }
610-
611- fn merge_embedding_metadata (
612- existing : & SemanticEmbeddingMetadata ,
613- expected : & SemanticEmbeddingMetadata ,
614- ) -> SemanticEmbeddingMetadata {
615- if !embedding_metadata_compatible ( existing, expected) {
616- return expected. clone ( ) ;
617- }
618-
619- SemanticEmbeddingMetadata {
620- strategy : expected. strategy . clone ( ) ,
621- model : expected. model . clone ( ) ,
622- dimensions : if expected. dimensions > 0 {
623- expected. dimensions
624- } else {
625- existing. dimensions
626- } ,
627- }
628- }
629-
630511fn remove_entries_for_file ( index : & mut SemanticIndex , file_path : & Path ) {
631512 index
632513 . entries
@@ -652,13 +533,6 @@ fn hash_text(content: &str) -> String {
652533 format ! ( "{:x}" , digest)
653534}
654535
/// Splits `text` into tokens at every non-alphanumeric character.
///
/// Empty fragments are dropped and each remaining token has its ASCII letters
/// lowercased; non-ASCII characters are left unchanged.
fn tokenize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for piece in text.split(|ch: char| !ch.is_alphanumeric()) {
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.to_ascii_lowercase());
    }
    tokens
}
661-
662536fn is_code_file ( path : & Path ) -> bool {
663537 path. extension ( )
664538 . and_then ( |extension| extension. to_str ( ) )
0 commit comments