perf: add explicit SIMD types and distance kernels for f64 (#6540)

justinrmiller · claude · web-flow · commit c913ff8f312b · 2026-04-16T18:57:01.000+08:00
## Summary

- Adds `f64x4` and `f64x8` SIMD types to `lance-linalg` with support for x86_64 (AVX2/AVX-512), aarch64 (NEON), and loongarch64 (LASX)
- Replaces auto-vectorization-dependent f64 distance functions with explicit SIMD using two-level unrolling (f64x8 + f64x4 + scalar tail)
- Updates norm_l2, dot, L2, and cosine distance for f64

## Benchmark Results (Apple M-series, aarch64 NEON)

1M vectors × 1024 dimensions:

| Benchmark | Before | After | Change |
|-----------|--------|-------|--------|
| NormL2(f64, auto-vec) | 117.76 ms | 116.04 ms | ~same |
| NormL2(f64, SIMD) | N/A (TODO) | 119.16 ms | new |
| Dot(f64, auto-vec) | 129.36 ms | 130.23 ms | ~same |
| L2(f64, auto-vec) | 132.53 ms | 135.15 ms | ~same |
| **Cosine(f64, auto-vec)** | **202.52 ms** | **139.23 ms** | **-31.4%** |

The biggest win is **cosine distance**, which previously had an empty `impl Cosine for f64 {}` falling back to the scalar path. The explicit SIMD implementation is **31% faster**.

For norm_l2, dot, and L2, LLVM's auto-vectorization with the LANES=8 hint was already producing good code on this platform. The explicit SIMD ensures consistent performance across compilers and platforms rather than relying on fragile auto-vectorization hints.

## Test plan

- [x] All 59 lance-linalg tests pass
- [x] Clippy clean (`-D warnings`)
- [x] `cargo fmt` clean
- [ ] CI passes on all platforms

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/rust/lance-linalg/benches/norm_l2.rs b/rust/lance-linalg/benches/norm_l2.rs
@@ -13,7 +13,7 @@ use num_traits::Float;
 use rand::Rng;
 
 use lance_arrow::{ArrowFloatType, FloatArray, bfloat16::BFloat16Type};
-use lance_linalg::distance::{norm_l2, norm_l2_impl};
+use lance_linalg::distance::{norm_l2, norm_l2_f64_simd, norm_l2_impl};
 use lance_testing::datagen::generate_random_array_with_seed;
 
 #[cfg(target_os = "linux")]
@@ -106,7 +106,7 @@ fn bench_distance(c: &mut Criterion) {
         c,
         target.as_slice(),
         norm_l2_impl::<f64, f32, 8>,
-        None, // TODO: implement SIMD for f64
+        Some(norm_l2_f64_simd),
     );
 }
 
diff --git a/rust/lance-linalg/src/distance/cosine.rs b/rust/lance-linalg/src/distance/cosine.rs
@@ -230,7 +230,47 @@ impl Cosine for f32 {
     }
 }
 
-impl Cosine for f64 {}
+impl Cosine for f64 {
+    /// Cosine distance `1 - dot(x, y) / (x_norm * |y|)` for `f64` vectors, using explicit SIMD.
+    ///
+    /// `dot(x, y)` and `|y|^2` are accumulated in a single pass with two-level unrolling:
+    /// 8-lane `f64x8` blocks, then at most one 4-lane `f64x4` block, then a scalar tail.
+    ///
+    /// * `x_norm` - expected to be the L2 norm of `x`, precomputed by the caller; it is
+    ///   neither recomputed nor checked here.
+    ///
+    /// If `x` and `y` differ in length, only their common prefix is used, for both the dot
+    /// product and the norm of `y`. All arithmetic stays in `f64`; the result is narrowed
+    /// to `f32` once, on return.
+    #[inline]
+    fn cosine_fast(x: &[Self], x_norm: f32, y: &[Self]) -> f32 {
+        use crate::simd::f64::{f64x4, f64x8};
+        use crate::simd::{FloatSimd, SIMD};
+
+        // Restrict both inputs to their common prefix so that the blocks and tails of `x`
+        // and `y` pair up element for element and no load can run past the shorter slice.
+        let dim = x.len().min(y.len());
+        let (x, y) = (&x[..dim], &y[..dim]);
+
+        let x_blocks = x.chunks_exact(8);
+        let y_blocks = y.chunks_exact(8);
+        let (x_rest, y_rest) = (x_blocks.remainder(), y_blocks.remainder());
+
+        let mut y_norm8 = f64x8::zeros();
+        let mut xy8 = f64x8::zeros();
+        for (xc, yc) in x_blocks.zip(y_blocks) {
+            // SAFETY: `chunks_exact(8)` yields slices of exactly 8 elements, so each 8-lane
+            // load stays inside its chunk.
+            unsafe {
+                let xv = f64x8::load_unaligned(xc.as_ptr());
+                let yv = f64x8::load_unaligned(yc.as_ptr());
+                xy8.multiply_add(xv, yv);
+                y_norm8.multiply_add(yv, yv);
+            }
+        }
+
+        // Fewer than 8 elements remain, so this loop runs at most once.
+        let x_quads = x_rest.chunks_exact(4);
+        let y_quads = y_rest.chunks_exact(4);
+        let (x_tail, y_tail) = (x_quads.remainder(), y_quads.remainder());
+
+        let mut y_norm4 = f64x4::zeros();
+        let mut xy4 = f64x4::zeros();
+        for (xc, yc) in x_quads.zip(y_quads) {
+            // SAFETY: `chunks_exact(4)` yields slices of exactly 4 elements, so each 4-lane
+            // load stays inside its chunk.
+            unsafe {
+                let xv = f64x4::load_unaligned(xc.as_ptr());
+                let yv = f64x4::load_unaligned(yc.as_ptr());
+                xy4.multiply_add(xv, yv);
+                y_norm4.multiply_add(yv, yv);
+            }
+        }
+
+        let tail_y_norm: Self = y_tail.iter().map(|&v| v * v).sum();
+        let tail_xy: Self = x_tail.iter().zip(y_tail).map(|(&a, &b)| a * b).sum();
+
+        let y_norm_sq = y_norm8.reduce_sum() + y_norm4.reduce_sum() + tail_y_norm;
+        let xy = xy8.reduce_sum() + xy4.reduce_sum() + tail_xy;
+        // Divide and take the square root in f64: narrowing `xy` or `y_norm_sq` to f32 first
+        // would overflow to infinity for large-magnitude inputs whose norm still fits in f32.
+        (1.0 - xy / (x_norm as f64) / y_norm_sq.sqrt()) as f32
+    }
+}
 
 /// Fallback non-SIMD implementation
 #[inline]
diff --git a/rust/lance-linalg/src/distance/dot.rs b/rust/lance-linalg/src/distance/dot.rs
@@ -145,10 +145,47 @@ impl Dot for f32 {
 impl Dot for f64 {
     #[inline]
     fn dot(x: &[Self], y: &[Self]) -> f32 {
-        dot_scalar::<Self, Self, 8>(x, y) as f32
+        dot_f64_simd(x, y) // explicit f64x8/f64x4 SIMD kernel; it already returns f32
     }
 }
 
+/// Explicit SIMD dot product for f64.
+///
+/// Accumulates `x[i] * y[i]` with two-level unrolling: 8-lane `f64x8` blocks, then at most
+/// one 4-lane `f64x4` block, then a scalar tail. The sum is kept in `f64` and narrowed to
+/// `f32` only on return.
+///
+/// If the slices differ in length, only their common prefix contributes.
+#[inline]
+fn dot_f64_simd(x: &[f64], y: &[f64]) -> f32 {
+    use crate::simd::f64::{f64x4, f64x8};
+    use crate::simd::{FloatSimd, SIMD};
+
+    // Restrict both inputs to their common prefix so that the blocks and tails of `x` and
+    // `y` pair up element for element and no load can run past the shorter slice.
+    let dim = x.len().min(y.len());
+    let (x, y) = (&x[..dim], &y[..dim]);
+
+    let x_blocks = x.chunks_exact(8);
+    let y_blocks = y.chunks_exact(8);
+    let (x_rest, y_rest) = (x_blocks.remainder(), y_blocks.remainder());
+
+    let mut acc8 = f64x8::zeros();
+    for (xc, yc) in x_blocks.zip(y_blocks) {
+        // SAFETY: `chunks_exact(8)` yields slices of exactly 8 elements, so each 8-lane
+        // load stays inside its chunk.
+        unsafe {
+            let a = f64x8::load_unaligned(xc.as_ptr());
+            let b = f64x8::load_unaligned(yc.as_ptr());
+            acc8.multiply_add(a, b);
+        }
+    }
+
+    // Fewer than 8 elements remain, so this loop runs at most once.
+    let x_quads = x_rest.chunks_exact(4);
+    let y_quads = y_rest.chunks_exact(4);
+    let (x_tail, y_tail) = (x_quads.remainder(), y_quads.remainder());
+
+    let mut acc4 = f64x4::zeros();
+    for (xc, yc) in x_quads.zip(y_quads) {
+        // SAFETY: `chunks_exact(4)` yields slices of exactly 4 elements, so each 4-lane
+        // load stays inside its chunk.
+        unsafe {
+            let a = f64x4::load_unaligned(xc.as_ptr());
+            let b = f64x4::load_unaligned(yc.as_ptr());
+            acc4.multiply_add(a, b);
+        }
+    }
+
+    let tail: f64 = x_tail.iter().zip(y_tail).map(|(&a, &b)| a * b).sum();
+
+    (acc8.reduce_sum() + acc4.reduce_sum() + tail) as f32
+}
+
 impl Dot for u8 {
     #[inline]
     fn dot(x: &[Self], y: &[Self]) -> f32 {
diff --git a/rust/lance-linalg/src/distance/l2.rs b/rust/lance-linalg/src/distance/l2.rs
@@ -170,10 +170,52 @@ impl L2 for f32 {
 impl L2 for f64 {
     #[inline]
     fn l2(x: &[Self], y: &[Self]) -> f32 {
-        l2_scalar::<Self, Self, 8>(x, y) as f32
+        l2_f64_simd(x, y) // explicit f64x8/f64x4 SIMD kernel; it already returns f32
     }
 }
 
+/// Explicit SIMD L2 distance for f64.
+///
+/// Returns the sum of squared differences `(x[i] - y[i])^2`; no square root is taken.
+/// Uses two-level unrolling: 8-lane `f64x8` blocks, then at most one 4-lane `f64x4` block,
+/// then a scalar tail. The sum is kept in `f64` and narrowed to `f32` only on return.
+///
+/// If the slices differ in length, only their common prefix contributes.
+#[inline]
+fn l2_f64_simd(x: &[f64], y: &[f64]) -> f32 {
+    use crate::simd::f64::{f64x4, f64x8};
+    use crate::simd::{FloatSimd, SIMD};
+
+    // Restrict both inputs to their common prefix so that the blocks and tails of `x` and
+    // `y` pair up element for element and no load can run past the shorter slice.
+    let dim = x.len().min(y.len());
+    let (x, y) = (&x[..dim], &y[..dim]);
+
+    let x_blocks = x.chunks_exact(8);
+    let y_blocks = y.chunks_exact(8);
+    let (x_rest, y_rest) = (x_blocks.remainder(), y_blocks.remainder());
+
+    let mut acc8 = f64x8::zeros();
+    for (xc, yc) in x_blocks.zip(y_blocks) {
+        // SAFETY: `chunks_exact(8)` yields slices of exactly 8 elements, so each 8-lane
+        // load stays inside its chunk.
+        unsafe {
+            let a = f64x8::load_unaligned(xc.as_ptr());
+            let b = f64x8::load_unaligned(yc.as_ptr());
+            let diff = a - b;
+            acc8.multiply_add(diff, diff);
+        }
+    }
+
+    // Fewer than 8 elements remain, so this loop runs at most once.
+    let x_quads = x_rest.chunks_exact(4);
+    let y_quads = y_rest.chunks_exact(4);
+    let (x_tail, y_tail) = (x_quads.remainder(), y_quads.remainder());
+
+    let mut acc4 = f64x4::zeros();
+    for (xc, yc) in x_quads.zip(y_quads) {
+        // SAFETY: `chunks_exact(4)` yields slices of exactly 4 elements, so each 4-lane
+        // load stays inside its chunk.
+        unsafe {
+            let a = f64x4::load_unaligned(xc.as_ptr());
+            let b = f64x4::load_unaligned(yc.as_ptr());
+            let diff = a - b;
+            acc4.multiply_add(diff, diff);
+        }
+    }
+
+    let tail: f64 = x_tail
+        .iter()
+        .zip(y_tail)
+        .map(|(&a, &b)| {
+            let diff = a - b;
+            diff * diff
+        })
+        .sum();
+
+    (acc8.reduce_sum() + acc4.reduce_sum() + tail) as f32
+}
+
 /// Accumulate squared differences for one dimension into per-target results.
 ///
 /// Separated into its own function so that LLVM sees `row` and `result`
diff --git a/rust/lance-linalg/src/distance/norm_l2.rs b/rust/lance-linalg/src/distance/norm_l2.rs
@@ -97,10 +97,42 @@ impl Normalize for f32 {
 impl Normalize for f64 {
     #[inline]
     fn norm_l2(vector: &[Self]) -> f32 {
-        norm_l2_impl::<Self, Self, 8>(vector) as f32
+        norm_l2_f64_simd(vector) // explicit f64x8/f64x4 SIMD kernel; it already returns f32
     }
 }
 
+/// L2 norm of an `f64` slice, computed with explicit SIMD.
+///
+/// The squared elements are summed in three stages: 8-lane `f64x8` blocks, then at most
+/// one 4-lane `f64x4` block, then a scalar tail. The square root is taken in `f64` and
+/// the result is narrowed to `f32`.
+#[inline]
+pub fn norm_l2_f64_simd(vector: &[f64]) -> f32 {
+    use crate::simd::f64::{f64x4, f64x8};
+    use crate::simd::{FloatSimd, SIMD};
+
+    let wide = vector.chunks_exact(8);
+    // Fewer than 8 elements are left after the wide blocks, so `narrow` has at most one chunk.
+    let narrow = wide.remainder().chunks_exact(4);
+    let scalar_part = narrow.remainder();
+
+    let mut sum8 = f64x8::zeros();
+    for block in wide {
+        // SAFETY: `chunks_exact(8)` yields slices of exactly 8 elements, so the 8-lane
+        // load stays inside the chunk.
+        unsafe {
+            let lanes = f64x8::load_unaligned(block.as_ptr());
+            sum8.multiply_add(lanes, lanes);
+        }
+    }
+
+    let mut sum4 = f64x4::zeros();
+    for block in narrow {
+        // SAFETY: `chunks_exact(4)` yields slices of exactly 4 elements, so the 4-lane
+        // load stays inside the chunk.
+        unsafe {
+            let lanes = f64x4::load_unaligned(block.as_ptr());
+            sum4.multiply_add(lanes, lanes);
+        }
+    }
+
+    let scalar_sum: f64 = scalar_part.iter().map(|&v| v * v).sum();
+    (sum8.reduce_sum() + sum4.reduce_sum() + scalar_sum).sqrt() as f32
+}
+
 /// NOTE: this is only pub for benchmarking purposes
 #[inline]
 pub fn norm_l2_impl<
diff --git a/rust/lance-linalg/src/simd.rs b/rust/lance-linalg/src/simd.rs
@@ -16,6 +16,7 @@ use std::ops::{Add, AddAssign, Mul, Sub, SubAssign};
 
 pub mod dist_table;
 pub mod f32;
+pub mod f64;
 pub mod i32;
 pub mod u8;
 
diff --git a/rust/lance-linalg/src/simd/f64.rs b/rust/lance-linalg/src/simd/f64.rs