@@ -125,16 +125,69 @@ pub fn try_vnni_matmul_u8(
125125 false
126126}
127127
128- /// Build a k×k distance table from k centroids using VNNI if available.
128+ /// Build a k×k COSINE SIMILARITY table from f32 centroids.
129+ ///
130+ /// Takes raw f32 centroids, normalizes to unit vectors, quantizes,
131+ /// runs tiered VNNI/AMX dot product, maps to u8 [0, 255].
132+ ///
133+ /// This IS the ThinkingEngine's brain. cosine[-1,1] → u8[0,255].
134+ /// 128 = orthogonal. 255 = identical. 0 = opposite.
135+ ///
136+ /// centroids_f32: [k × dim] raw f32 centroids (row-major)
137+ /// Returns: [k × k] u8 cosine similarity table
138+ #[ cfg( feature = "std" ) ]
139+ pub fn build_cosine_table ( centroids_f32 : & [ f32 ] , k : usize , dim : usize ) -> Vec < u8 > {
140+ assert_eq ! ( centroids_f32. len( ) , k * dim) ;
141+
142+ // Step 1: Normalize each centroid to unit vector
143+ let mut normed = vec ! [ 0.0f32 ; k * dim] ;
144+ for i in 0 ..k {
145+ let row = & centroids_f32[ i * dim..( i + 1 ) * dim] ;
146+ let norm: f32 = row. iter ( ) . map ( |v| v * v) . sum :: < f32 > ( ) . sqrt ( ) ;
147+ let inv_norm = if norm > 1e-10 { 1.0 / norm } else { 0.0 } ;
148+ for d in 0 ..dim {
149+ normed[ i * dim + d] = row[ d] * inv_norm;
150+ }
151+ }
152+
153+ // Step 2: Quantize normalized [-1, 1] → u8 [0, 255]
154+ // After normalization, values are in [-1, 1].
155+ // Map: u8 = round((value + 1.0) * 127.5)
156+ let centroids_u8: Vec < u8 > = normed. iter ( )
157+ . map ( |& v| ( ( v + 1.0 ) * 127.5 ) . round ( ) . clamp ( 0.0 , 255.0 ) as u8 )
158+ . collect ( ) ;
159+
160+ // Step 3: Compute dot products using tiered VNNI dispatch
161+ let raw_dots = build_distance_table_vnni ( & centroids_u8, k, dim) ;
162+
163+ // Step 4: Map i32 dot products → u8 cosine similarity [0, 255]
164+ // The dot product of two unit vectors quantized to u8 [0,255]:
165+ // max dot (identical) = sum of (u8_i)² over dim
166+ // min dot (opposite) = much lower
167+ // Find actual min/max to scale properly
168+ let min_dot = raw_dots. iter ( ) . copied ( ) . min ( ) . unwrap_or ( 0 ) as f64 ;
169+ let max_dot = raw_dots. iter ( ) . copied ( ) . max ( ) . unwrap_or ( 1 ) as f64 ;
170+ let range = ( max_dot - min_dot) . max ( 1.0 ) ;
171+
172+ let mut table = vec ! [ 128u8 ; k * k] ; // 128 = default orthogonal
173+ for i in 0 ..k {
174+ for j in 0 ..k {
175+ let raw = raw_dots[ i * k + j] as f64 ;
176+ let normalized = ( raw - min_dot) / range; // [0, 1]
177+ table[ i * k + j] = ( normalized * 255.0 ) . round ( ) . clamp ( 0.0 , 255.0 ) as u8 ;
178+ }
179+ }
180+
181+ table
182+ }
183+
184+ /// Build a k×k RAW DOT PRODUCT table from u8 centroids using VNNI if available.
129185///
130186/// centroids_u8: [k × dim] quantized codebook centroids (u8, row-major)
131187/// Returns: [k × k] i32 dot product matrix (symmetric)
132188///
133- /// Uses VNNI dot product (64 MACs/instruction) for each centroid pair.
134- /// Symmetric: only computes upper triangle, mirrors to lower.
135- ///
136- /// This IS the ThinkingEngine's brain construction step.
137- /// 4096² = 16M dot products. With VNNI: ~1:20h for large dim.
189+ /// For cosine: use build_cosine_table() which normalizes first.
190+ /// This function is for raw dot products when centroids are already u8.
138191#[ cfg( feature = "std" ) ]
139192pub fn build_distance_table_vnni ( centroids_u8 : & [ u8 ] , k : usize , dim : usize ) -> Vec < i32 > {
140193 assert_eq ! ( centroids_u8. len( ) , k * dim) ;
0 commit comments