Skip to content

Commit 9d570bc

Browse files
committed
feat: multi-versioned L1 kernel — AVX-512 → AVX2 → scalar via LazyLock
Per-function #[target_feature(enable = "avx512f")] / "avx2". LazyLock runtime detection, one binary for all ISAs. l1_avx512: _mm512_cvtepi16_epi32 + _mm512_sub + _mm512_abs + reduce_add l1_avx2: _mm256_cvtepi16_epi32 + _mm256_sub + _mm256_abs + horizontal sum l1_scalar: for i in 0..17 (non-x86 fallback) 605M lookups/sec (LazyLock) vs 728M (hardcoded AVX-512). 19 tests passing. .cargo/config.toml: no global target-cpu. https://claude.ai/code/session_01M3at4EuHVvQ8S95mSnKgtK
1 parent ae6f721 commit 9d570bc

2 files changed

Lines changed: 78 additions & 39 deletions

File tree

.cargo/config.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[build]
2-
# x86-64-v4 = AVX-512 mandatory. Railway + Claude backend = always AVX-512.
3-
# GitHub CI: set CARGO_BUILD_RUSTFLAGS="-C target-cpu=x86-64-v3" for AVX2 fallback.
4-
rustflags = ["-C", "target-cpu=x86-64-v4"]
2+
# No global target-cpu. Each kernel uses #[target_feature(enable = "avx512f")]
3+
# per-function, with LazyLock runtime detection. One binary, all ISAs.
4+
# Railway (AVX-512) and GitHub CI (AVX2) use the same binary.

src/hpc/bgz17_bridge.rs

Lines changed: 75 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,76 @@ pub struct Base17 {
3434
pub dims: [i16; BASE_DIM],
3535
}
3636

37+
// ============================================================================
38+
// Multi-versioned L1 kernel: AVX-512 → AVX2 → scalar. One binary, all ISAs.
39+
// ============================================================================
40+
41+
type L1Fn = unsafe fn(&[i16; 17], &[i16; 17]) -> u32;
42+
43+
#[cfg(target_arch = "x86_64")]
44+
#[target_feature(enable = "avx512f")]
45+
unsafe fn l1_avx512(a: &[i16; 17], b: &[i16; 17]) -> u32 {
46+
use std::arch::x86_64::*;
47+
// Load 16 i16 → 16 i32 via sign-extension
48+
let va = _mm512_cvtepi16_epi32(_mm256_loadu_si256(a.as_ptr() as *const __m256i));
49+
let vb = _mm512_cvtepi16_epi32(_mm256_loadu_si256(b.as_ptr() as *const __m256i));
50+
let diff = _mm512_sub_epi32(va, vb);
51+
let abs_diff = _mm512_abs_epi32(diff);
52+
let sum16 = _mm512_reduce_add_epi32(abs_diff) as u32;
53+
// 17th dim scalar
54+
let d16 = (a[16] as i32 - b[16] as i32).unsigned_abs();
55+
sum16 + d16
56+
}
57+
58+
#[cfg(target_arch = "x86_64")]
59+
#[target_feature(enable = "avx2")]
60+
unsafe fn l1_avx2(a: &[i16; 17], b: &[i16; 17]) -> u32 {
61+
use std::arch::x86_64::*;
62+
// Process 8 dims at a time (2 passes of 8 = 16, + 1 scalar)
63+
let va0 = _mm256_cvtepi16_epi32(_mm_loadu_si128(a.as_ptr() as *const __m128i));
64+
let vb0 = _mm256_cvtepi16_epi32(_mm_loadu_si128(b.as_ptr() as *const __m128i));
65+
let diff0 = _mm256_sub_epi32(va0, vb0);
66+
let abs0 = _mm256_abs_epi32(diff0);
67+
68+
let va1 = _mm256_cvtepi16_epi32(_mm_loadu_si128(a[8..].as_ptr() as *const __m128i));
69+
let vb1 = _mm256_cvtepi16_epi32(_mm_loadu_si128(b[8..].as_ptr() as *const __m128i));
70+
let diff1 = _mm256_sub_epi32(va1, vb1);
71+
let abs1 = _mm256_abs_epi32(diff1);
72+
73+
let sum = _mm256_add_epi32(abs0, abs1);
74+
// Horizontal sum of 8 i32
75+
let hi128 = _mm256_extracti128_si256(sum, 1);
76+
let lo128 = _mm256_castsi256_si128(sum);
77+
let sum128 = _mm_add_epi32(lo128, hi128);
78+
let sum64 = _mm_add_epi32(sum128, _mm_srli_si128(sum128, 8));
79+
let sum32 = _mm_add_epi32(sum64, _mm_srli_si128(sum64, 4));
80+
let sum16 = _mm_extract_epi32(sum32, 0) as u32;
81+
// 17th dim scalar
82+
let d16 = (a[16] as i32 - b[16] as i32).unsigned_abs();
83+
sum16 + d16
84+
}
85+
86+
fn l1_scalar(a: &[i16; 17], b: &[i16; 17]) -> u32 {
87+
let mut d = 0u32;
88+
for i in 0..17 {
89+
d += (a[i] as i32 - b[i] as i32).unsigned_abs();
90+
}
91+
d
92+
}
93+
94+
static L1_KERNEL: std::sync::LazyLock<L1Fn> = std::sync::LazyLock::new(|| {
95+
#[cfg(target_arch = "x86_64")]
96+
{
97+
if is_x86_feature_detected!("avx512f") {
98+
return l1_avx512 as L1Fn;
99+
}
100+
if is_x86_feature_detected!("avx2") {
101+
return l1_avx2 as L1Fn;
102+
}
103+
}
104+
l1_scalar as L1Fn
105+
});
106+
37107
/// SPO triple of Base17 patterns. 102 bytes.
38108
#[derive(Clone, Debug, PartialEq, Eq)]
39109
pub struct SpoBase17 {
@@ -89,45 +159,14 @@ impl Base17 {
89159
Base17 { dims: [0i16; BASE_DIM] }
90160
}
91161

92-
/// L1 (Manhattan) distance.
162+
/// L1 (Manhattan) distance — multi-versioned kernel.
93163
///
94-
/// AVX-512: load 16 of 17 i16 dims as i32, subtract, abs, horizontal sum.
95-
/// Last dim scalar. Total: ~3 instructions vs 17 scalar iterations.
164+
/// Runtime dispatch via LazyLock: AVX-512 → AVX2 → scalar.
165+
/// One binary serves all ISAs.
96166
#[inline]
97167
pub fn l1(&self, other: &Base17) -> u32 {
98-
#[cfg(target_arch = "x86_64")]
99-
{
100-
use crate::simd::I32x16;
101-
// Load 16 dims as i32 (sign-extend i16 → i32)
102-
let a: [i32; 16] = [
103-
self.dims[0] as i32, self.dims[1] as i32, self.dims[2] as i32, self.dims[3] as i32,
104-
self.dims[4] as i32, self.dims[5] as i32, self.dims[6] as i32, self.dims[7] as i32,
105-
self.dims[8] as i32, self.dims[9] as i32, self.dims[10] as i32, self.dims[11] as i32,
106-
self.dims[12] as i32, self.dims[13] as i32, self.dims[14] as i32, self.dims[15] as i32,
107-
];
108-
let b: [i32; 16] = [
109-
other.dims[0] as i32, other.dims[1] as i32, other.dims[2] as i32, other.dims[3] as i32,
110-
other.dims[4] as i32, other.dims[5] as i32, other.dims[6] as i32, other.dims[7] as i32,
111-
other.dims[8] as i32, other.dims[9] as i32, other.dims[10] as i32, other.dims[11] as i32,
112-
other.dims[12] as i32, other.dims[13] as i32, other.dims[14] as i32, other.dims[15] as i32,
113-
];
114-
let va = I32x16::from_array(a);
115-
let vb = I32x16::from_array(b);
116-
let diff = va - vb;
117-
let abs_diff = diff.abs();
118-
let sum16 = abs_diff.reduce_sum();
119-
// 17th dim scalar
120-
let d16 = (self.dims[16] as i32 - other.dims[16] as i32).unsigned_abs();
121-
sum16 as u32 + d16
122-
}
123-
#[cfg(not(target_arch = "x86_64"))]
124-
{
125-
let mut d = 0u32;
126-
for i in 0..BASE_DIM {
127-
d += (self.dims[i] as i32 - other.dims[i] as i32).unsigned_abs();
128-
}
129-
d
130-
}
168+
// SAFETY: LazyLock guarantees the selected kernel matches CPU features.
169+
unsafe { L1_KERNEL(&self.dims, &other.dims) }
131170
}
132171

133172
/// PCDVQ-informed L1: weight sign dimension 20x over mantissa.

0 commit comments

Comments
 (0)