@@ -90,13 +90,44 @@ impl Base17 {
9090 }
9191
9292 /// L1 (Manhattan) distance.
93+ ///
94+ /// AVX-512: load 16 of 17 i16 dims as i32, subtract, abs, horizontal sum.
95+ /// Last dim scalar. Total: ~3 instructions vs 17 scalar iterations.
9396 #[ inline]
9497 pub fn l1 ( & self , other : & Base17 ) -> u32 {
95- let mut d = 0u32 ;
96- for i in 0 ..BASE_DIM {
97- d += ( self . dims [ i] as i32 - other. dims [ i] as i32 ) . unsigned_abs ( ) ;
98+ #[ cfg( target_arch = "x86_64" ) ]
99+ {
100+ use crate :: simd:: I32x16 ;
101+ // Load 16 dims as i32 (sign-extend i16 → i32)
102+ let a: [ i32 ; 16 ] = [
103+ self . dims [ 0 ] as i32 , self . dims [ 1 ] as i32 , self . dims [ 2 ] as i32 , self . dims [ 3 ] as i32 ,
104+ self . dims [ 4 ] as i32 , self . dims [ 5 ] as i32 , self . dims [ 6 ] as i32 , self . dims [ 7 ] as i32 ,
105+ self . dims [ 8 ] as i32 , self . dims [ 9 ] as i32 , self . dims [ 10 ] as i32 , self . dims [ 11 ] as i32 ,
106+ self . dims [ 12 ] as i32 , self . dims [ 13 ] as i32 , self . dims [ 14 ] as i32 , self . dims [ 15 ] as i32 ,
107+ ] ;
108+ let b: [ i32 ; 16 ] = [
109+ other. dims [ 0 ] as i32 , other. dims [ 1 ] as i32 , other. dims [ 2 ] as i32 , other. dims [ 3 ] as i32 ,
110+ other. dims [ 4 ] as i32 , other. dims [ 5 ] as i32 , other. dims [ 6 ] as i32 , other. dims [ 7 ] as i32 ,
111+ other. dims [ 8 ] as i32 , other. dims [ 9 ] as i32 , other. dims [ 10 ] as i32 , other. dims [ 11 ] as i32 ,
112+ other. dims [ 12 ] as i32 , other. dims [ 13 ] as i32 , other. dims [ 14 ] as i32 , other. dims [ 15 ] as i32 ,
113+ ] ;
114+ let va = I32x16 :: from_array ( a) ;
115+ let vb = I32x16 :: from_array ( b) ;
116+ let diff = va - vb;
117+ let abs_diff = diff. abs ( ) ;
118+ let sum16 = abs_diff. reduce_sum ( ) ;
119+ // 17th dim scalar
120+ let d16 = ( self . dims [ 16 ] as i32 - other. dims [ 16 ] as i32 ) . unsigned_abs ( ) ;
121+ sum16 as u32 + d16
122+ }
123+ #[ cfg( not( target_arch = "x86_64" ) ) ]
124+ {
125+ let mut d = 0u32 ;
126+ for i in 0 ..BASE_DIM {
127+ d += ( self . dims [ i] as i32 - other. dims [ i] as i32 ) . unsigned_abs ( ) ;
128+ }
129+ d
98130 }
99- d
100131 }
101132
102133 /// PCDVQ-informed L1: weight sign dimension 20x over mantissa.
@@ -105,15 +136,34 @@ impl Base17 {
105136 /// quantization than magnitude. BF16 decomposition maps to polar:
106137 /// dim 0 = sign (direction), dims 1-6 = exponent (magnitude scale),
107138 /// dims 7-16 = mantissa (fine detail).
139+ /// PCDVQ-weighted L1 via SIMD: sign=20×, magnitude=3×, detail=1×.
108140 #[ inline]
109141 pub fn l1_weighted ( & self , other : & Base17 ) -> u32 {
110- let mut d = 0u32 ;
111- for i in 0 ..BASE_DIM {
112- let diff = ( self . dims [ i] as i32 - other. dims [ i] as i32 ) . unsigned_abs ( ) ;
113- let weight = if i == 0 { 20 } else if i < 7 { 3 } else { 1 } ;
114- d += diff * weight;
142+ #[ cfg( target_arch = "x86_64" ) ]
143+ {
144+ use crate :: simd:: I32x16 ;
145+ let a: [ i32 ; 16 ] = core:: array:: from_fn ( |i| self . dims [ i] as i32 ) ;
146+ let b: [ i32 ; 16 ] = core:: array:: from_fn ( |i| other. dims [ i] as i32 ) ;
147+ let weights: [ i32 ; 16 ] = [ 20 , 3 , 3 , 3 , 3 , 3 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ] ;
148+ let va = I32x16 :: from_array ( a) ;
149+ let vb = I32x16 :: from_array ( b) ;
150+ let vw = I32x16 :: from_array ( weights) ;
151+ let diff = ( va - vb) . abs ( ) ;
152+ let weighted = diff * vw;
153+ let sum16 = weighted. reduce_sum ( ) as u32 ;
154+ let d16 = ( self . dims [ 16 ] as i32 - other. dims [ 16 ] as i32 ) . unsigned_abs ( ) ;
155+ sum16 + d16
156+ }
157+ #[ cfg( not( target_arch = "x86_64" ) ) ]
158+ {
159+ let mut d = 0u32 ;
160+ for i in 0 ..BASE_DIM {
161+ let diff = ( self . dims [ i] as i32 - other. dims [ i] as i32 ) . unsigned_abs ( ) ;
162+ let weight = if i == 0 { 20 } else if i < 7 { 3 } else { 1 } ;
163+ d += diff * weight;
164+ }
165+ d
115166 }
116- d
117167 }
118168
119169 /// Sign-bit agreement (out of 17).
0 commit comments