@@ -9,41 +9,80 @@ import (
99 "github.com/SimplyLiz/CodeMCP/internal/lip"
1010)
1111
// RerankConfig holds the tunable knobs for the LIP semantic re-ranking
// blend. Its zero value is unusable; obtain a valid configuration from
// DefaultRerankConfig. Keeping the knobs in a struct means future empirical
// tuning (golden-query harness, per-repo overrides) touches no call sites.
type RerankConfig struct {
	// LexicalWeight scales the 1/rank position score.
	LexicalWeight float64
	// SemanticWeight scales the centroid cosine-similarity score.
	SemanticWeight float64
	// SeedCount is how many top-ranked results seed the query centroid.
	SeedCount int
	// MinCoherence is the smallest centroid norm at which the semantic
	// signal is trusted. Seed vectors are L2-normalised before weighting,
	// so the position-weighted centroid norm falls in [0, 1]: 1.0 when all
	// seeds agree on a direction, near 0 when they cancel out. Below this
	// threshold the re-ranker falls back to pure lexical order rather than
	// amplify noise from semantically scattered top results.
	MinCoherence float64
}

// DefaultRerankConfig returns the current production defaults. They predate
// any empirical tuning pass, so treat them as a reasonable starting point
// rather than a proven optimum.
func DefaultRerankConfig() RerankConfig {
	cfg := RerankConfig{
		LexicalWeight:  0.6,
		SemanticWeight: 0.4,
		SeedCount:      5,
		MinCoherence:   0.35,
	}
	return cfg
}
1643
1744// RerankWithLIP re-ranks results using semantic similarity from LIP embeddings.
1845// It is the Fast-tier counterpart of RerankWithPPR: where PPR uses graph
1946// proximity over the SCIP symbol graph, this function uses file embedding
20- // dot-product similarity as the second ranking signal.
47+ // cosine similarity as the second ranking signal.
2148//
2249// Algorithm:
2350// 1. Fetch embeddings for all candidate files in a single batch RPC.
24- // 2. Average the top-lipSeedN seed vectors → L2-normalised query centroid.
25- // 3. Score every candidate: 0.6 * lexical_position + 0.4 * dot_product(vec, centroid).
26- // 4. Re-sort by combined score.
51+ // 2. Build a position-weighted, L2-normalised seed centroid from the top
52+ // SeedCount candidates (weight 1/(rank+1) so top-1 dominates softly).
53+ // 3. Measure centroid coherence; if seeds disagree, return results unchanged.
54+ // 4. Score every candidate: LexicalWeight * 1/rank + SemanticWeight * cosine.
55+ // 5. Re-sort by combined score.
2756//
28- // Degrades silently when LIP is unavailable — the original results are returned
29- // unchanged, so callers never need to handle the failure path specially.
30- func RerankWithLIP (_ context.Context , results []SearchResultItem , repoRoot , _ string ) ([]SearchResultItem , error ) {
57+ // Degrades silently when LIP is unavailable or the signal is weak — the
58+ // original results are returned unchanged.
59+ func RerankWithLIP (ctx context.Context , results []SearchResultItem , repoRoot , query string ) ([]SearchResultItem , error ) {
60+ return rerankWithLIP (ctx , results , repoRoot , query , DefaultRerankConfig (), lip .GetEmbeddingsBatch )
61+ }
62+
// embedBatchFn mirrors the signature of lip.GetEmbeddingsBatch; it exists as
// a seam so tests can feed in synthetic embeddings without a running daemon.
type embedBatchFn func(fileURIs []string, model string) ([][]float32, error)
66+
67+ func rerankWithLIP (
68+ _ context.Context ,
69+ results []SearchResultItem ,
70+ repoRoot , _ string ,
71+ cfg RerankConfig ,
72+ embed embedBatchFn ,
73+ ) ([]SearchResultItem , error ) {
3174 if len (results ) <= 3 {
3275 return results , nil
3376 }
3477
35- // Build URI list, preserving index correspondence with results.
3678 uris := make ([]string , len (results ))
3779 for i , r := range results {
3880 uris [i ] = lipFileURI (repoRoot , r )
3981 }
4082
41- // Single batch RPC instead of N individual round-trips.
42- batchVecs , _ := lip .GetEmbeddingsBatch (uris , "" )
43- vecs := batchVecs
83+ vecs , _ := embed (uris , "" )
4484 if vecs == nil {
45- // LIP not running — allocate a nil slice so the rest of the function is uniform.
46- vecs = make ([][]float32 , len (results ))
85+ return results , nil
4786 }
4887
4988 dims := 0
@@ -57,65 +96,110 @@ func RerankWithLIP(_ context.Context, results []SearchResultItem, repoRoot, _ st
5796 return results , nil
5897 }
5998
60- // Build centroid from the top-N seeds (lexical ordering).
61- seedN := min (lipSeedN , len (results ))
99+ centroid , coherence := buildSeedCentroid (vecs , cfg .SeedCount , dims )
100+ if centroid == nil || coherence < cfg .MinCoherence {
101+ // Seeds too scattered (or too few) to trust the semantic signal.
102+ return results , nil
103+ }
104+
105+ // Score every candidate and re-sort.
106+ type scored struct {
107+ item SearchResultItem
108+ score float64
109+ }
110+ out := make ([]scored , len (results ))
111+ for i , r := range results {
112+ posScore := 1.0 / (float64 (i ) + 1.0 )
113+ semScore := cosine (vecs [i ], centroid )
114+ out [i ] = scored {item : r , score : cfg .LexicalWeight * posScore + cfg .SemanticWeight * semScore }
115+ }
116+
117+ sort .Slice (out , func (i , j int ) bool { return out [i ].score > out [j ].score })
118+
119+ reranked := make ([]SearchResultItem , len (out ))
120+ for i , s := range out {
121+ reranked [i ] = s .item
122+ }
123+ return reranked , nil
124+ }
125+
126+ // buildSeedCentroid builds a position-weighted centroid from the top-N seed
127+ // vectors. Each seed is L2-normalised before weighting so the resulting
128+ // centroid norm is a direct coherence measure in [0, 1] — 1.0 when seeds
129+ // point the same direction, near-0 when they cancel.
130+ //
131+ // Returns (nil, 0) when fewer than two seeds have embeddings.
132+ func buildSeedCentroid (vecs [][]float32 , seedN , dims int ) ([]float64 , float64 ) {
133+ if seedN > len (vecs ) {
134+ seedN = len (vecs )
135+ }
136+
62137 centroid := make ([]float64 , dims )
138+ totalW := 0.0
63139 nSeeds := 0
64140 for i := 0 ; i < seedN ; i ++ {
65- if vecs [i ] == nil {
141+ if len ( vecs [i ]) == 0 {
66142 continue
67143 }
144+ // L2-normalise the seed so every contribution has unit magnitude.
145+ var norm float64
146+ for _ , x := range vecs [i ] {
147+ norm += float64 (x ) * float64 (x )
148+ }
149+ norm = math .Sqrt (norm )
150+ if norm == 0 {
151+ continue
152+ }
153+ w := 1.0 / float64 (i + 1 )
154+ totalW += w
68155 for d , x := range vecs [i ] {
69- centroid [d ] += float64 (x )
156+ centroid [d ] += w * float64 (x ) / norm
70157 }
71158 nSeeds ++
72159 }
73- if nSeeds < 2 {
74- // Not enough seed embeddings to form a meaningful centroid.
75- return results , nil
160+ if nSeeds < 2 || totalW == 0 {
161+ return nil , 0
76162 }
163+
164+ // Normalise by total weight so the centroid lives in the unit ball.
165+ // With each seed a unit vector and weights summing to totalW, the
166+ // unweighted-normalised centroid norm is bounded by 1 — that's our
167+ // coherence metric.
77168 for d := range centroid {
78- centroid [d ] /= float64 ( nSeeds )
169+ centroid [d ] /= totalW
79170 }
80- // L2-normalise so dot products are cosine similarities.
81- var norm float64
171+ var coherence float64
82172 for _ , x := range centroid {
83- norm += x * x
173+ coherence += x * x
84174 }
85- if norm = math .Sqrt (norm ); norm > 0 {
175+ coherence = math .Sqrt (coherence )
176+
177+ // Re-normalise centroid to unit length for cosine similarity scoring.
178+ if coherence > 0 {
86179 for d := range centroid {
87- centroid [d ] /= norm
180+ centroid [d ] /= coherence
88181 }
89182 }
183+ return centroid , coherence
184+ }
90185
// cosine returns the cosine similarity between a float32 vector and a
// unit-length float64 centroid. A zero-length, length-mismatched, or
// all-zero v yields 0.
func cosine(v []float32, centroid []float64) float64 {
	if len(v) == 0 || len(v) != len(centroid) {
		return 0
	}
	dot, sumSq := 0.0, 0.0
	for i := range v {
		x := float64(v[i])
		dot += x * centroid[i]
		sumSq += x * x
	}
	// Only v needs normalising here — the centroid is already unit length.
	vNorm := math.Sqrt(sumSq)
	if vNorm == 0 {
		return 0
	}
	return dot / vNorm
}
120204
121205// SemanticSearchWithLIP queries LIP's nearest-neighbour index for files matching
0 commit comments