@@ -34,6 +34,76 @@ pub struct Base17 {
3434 pub dims : [ i16 ; BASE_DIM ] ,
3535}
3636
37+ // ============================================================================
38+ // Multi-versioned L1 kernel: AVX-512 → AVX2 → scalar. One binary, all ISAs.
39+ // ============================================================================
40+
41+ type L1Fn = unsafe fn ( & [ i16 ; 17 ] , & [ i16 ; 17 ] ) -> u32 ;
42+
43+ #[ cfg( target_arch = "x86_64" ) ]
44+ #[ target_feature( enable = "avx512f" ) ]
45+ unsafe fn l1_avx512 ( a : & [ i16 ; 17 ] , b : & [ i16 ; 17 ] ) -> u32 {
46+ use std:: arch:: x86_64:: * ;
47+ // Load 16 i16 → 16 i32 via sign-extension
48+ let va = _mm512_cvtepi16_epi32 ( _mm256_loadu_si256 ( a. as_ptr ( ) as * const __m256i ) ) ;
49+ let vb = _mm512_cvtepi16_epi32 ( _mm256_loadu_si256 ( b. as_ptr ( ) as * const __m256i ) ) ;
50+ let diff = _mm512_sub_epi32 ( va, vb) ;
51+ let abs_diff = _mm512_abs_epi32 ( diff) ;
52+ let sum16 = _mm512_reduce_add_epi32 ( abs_diff) as u32 ;
53+ // 17th dim scalar
54+ let d16 = ( a[ 16 ] as i32 - b[ 16 ] as i32 ) . unsigned_abs ( ) ;
55+ sum16 + d16
56+ }
57+
58+ #[ cfg( target_arch = "x86_64" ) ]
59+ #[ target_feature( enable = "avx2" ) ]
60+ unsafe fn l1_avx2 ( a : & [ i16 ; 17 ] , b : & [ i16 ; 17 ] ) -> u32 {
61+ use std:: arch:: x86_64:: * ;
62+ // Process 8 dims at a time (2 passes of 8 = 16, + 1 scalar)
63+ let va0 = _mm256_cvtepi16_epi32 ( _mm_loadu_si128 ( a. as_ptr ( ) as * const __m128i ) ) ;
64+ let vb0 = _mm256_cvtepi16_epi32 ( _mm_loadu_si128 ( b. as_ptr ( ) as * const __m128i ) ) ;
65+ let diff0 = _mm256_sub_epi32 ( va0, vb0) ;
66+ let abs0 = _mm256_abs_epi32 ( diff0) ;
67+
68+ let va1 = _mm256_cvtepi16_epi32 ( _mm_loadu_si128 ( a[ 8 ..] . as_ptr ( ) as * const __m128i ) ) ;
69+ let vb1 = _mm256_cvtepi16_epi32 ( _mm_loadu_si128 ( b[ 8 ..] . as_ptr ( ) as * const __m128i ) ) ;
70+ let diff1 = _mm256_sub_epi32 ( va1, vb1) ;
71+ let abs1 = _mm256_abs_epi32 ( diff1) ;
72+
73+ let sum = _mm256_add_epi32 ( abs0, abs1) ;
74+ // Horizontal sum of 8 i32
75+ let hi128 = _mm256_extracti128_si256 ( sum, 1 ) ;
76+ let lo128 = _mm256_castsi256_si128 ( sum) ;
77+ let sum128 = _mm_add_epi32 ( lo128, hi128) ;
78+ let sum64 = _mm_add_epi32 ( sum128, _mm_srli_si128 ( sum128, 8 ) ) ;
79+ let sum32 = _mm_add_epi32 ( sum64, _mm_srli_si128 ( sum64, 4 ) ) ;
80+ let sum16 = _mm_extract_epi32 ( sum32, 0 ) as u32 ;
81+ // 17th dim scalar
82+ let d16 = ( a[ 16 ] as i32 - b[ 16 ] as i32 ) . unsigned_abs ( ) ;
83+ sum16 + d16
84+ }
85+
86+ fn l1_scalar ( a : & [ i16 ; 17 ] , b : & [ i16 ; 17 ] ) -> u32 {
87+ let mut d = 0u32 ;
88+ for i in 0 ..17 {
89+ d += ( a[ i] as i32 - b[ i] as i32 ) . unsigned_abs ( ) ;
90+ }
91+ d
92+ }
93+
94+ static L1_KERNEL : std:: sync:: LazyLock < L1Fn > = std:: sync:: LazyLock :: new ( || {
95+ #[ cfg( target_arch = "x86_64" ) ]
96+ {
97+ if is_x86_feature_detected ! ( "avx512f" ) {
98+ return l1_avx512 as L1Fn ;
99+ }
100+ if is_x86_feature_detected ! ( "avx2" ) {
101+ return l1_avx2 as L1Fn ;
102+ }
103+ }
104+ l1_scalar as L1Fn
105+ } ) ;
106+
37107/// SPO triple of Base17 patterns. 102 bytes.
38108#[ derive( Clone , Debug , PartialEq , Eq ) ]
39109pub struct SpoBase17 {
@@ -89,45 +159,14 @@ impl Base17 {
89159 Base17 { dims : [ 0i16 ; BASE_DIM ] }
90160 }
91161
92- /// L1 (Manhattan) distance.
162+ /// L1 (Manhattan) distance — multi-versioned kernel .
93163 ///
94- /// AVX-512: load 16 of 17 i16 dims as i32, subtract, abs, horizontal sum .
95- /// Last dim scalar. Total: ~3 instructions vs 17 scalar iterations .
164+ /// Runtime dispatch via LazyLock: AVX-512 → AVX2 → scalar .
165+ /// One binary serves all ISAs .
96166 #[ inline]
97167 pub fn l1 ( & self , other : & Base17 ) -> u32 {
98- #[ cfg( target_arch = "x86_64" ) ]
99- {
100- use crate :: simd:: I32x16 ;
101- // Load 16 dims as i32 (sign-extend i16 → i32)
102- let a: [ i32 ; 16 ] = [
103- self . dims [ 0 ] as i32 , self . dims [ 1 ] as i32 , self . dims [ 2 ] as i32 , self . dims [ 3 ] as i32 ,
104- self . dims [ 4 ] as i32 , self . dims [ 5 ] as i32 , self . dims [ 6 ] as i32 , self . dims [ 7 ] as i32 ,
105- self . dims [ 8 ] as i32 , self . dims [ 9 ] as i32 , self . dims [ 10 ] as i32 , self . dims [ 11 ] as i32 ,
106- self . dims [ 12 ] as i32 , self . dims [ 13 ] as i32 , self . dims [ 14 ] as i32 , self . dims [ 15 ] as i32 ,
107- ] ;
108- let b: [ i32 ; 16 ] = [
109- other. dims [ 0 ] as i32 , other. dims [ 1 ] as i32 , other. dims [ 2 ] as i32 , other. dims [ 3 ] as i32 ,
110- other. dims [ 4 ] as i32 , other. dims [ 5 ] as i32 , other. dims [ 6 ] as i32 , other. dims [ 7 ] as i32 ,
111- other. dims [ 8 ] as i32 , other. dims [ 9 ] as i32 , other. dims [ 10 ] as i32 , other. dims [ 11 ] as i32 ,
112- other. dims [ 12 ] as i32 , other. dims [ 13 ] as i32 , other. dims [ 14 ] as i32 , other. dims [ 15 ] as i32 ,
113- ] ;
114- let va = I32x16 :: from_array ( a) ;
115- let vb = I32x16 :: from_array ( b) ;
116- let diff = va - vb;
117- let abs_diff = diff. abs ( ) ;
118- let sum16 = abs_diff. reduce_sum ( ) ;
119- // 17th dim scalar
120- let d16 = ( self . dims [ 16 ] as i32 - other. dims [ 16 ] as i32 ) . unsigned_abs ( ) ;
121- sum16 as u32 + d16
122- }
123- #[ cfg( not( target_arch = "x86_64" ) ) ]
124- {
125- let mut d = 0u32 ;
126- for i in 0 ..BASE_DIM {
127- d += ( self . dims [ i] as i32 - other. dims [ i] as i32 ) . unsigned_abs ( ) ;
128- }
129- d
130- }
168+ // SAFETY: LazyLock guarantees the selected kernel matches CPU features.
169+ unsafe { L1_KERNEL ( & self . dims , & other. dims ) }
131170 }
132171
133172 /// PCDVQ-informed L1: weight sign dimension 20x over mantissa.
0 commit comments