perf: add SIMD kernels for bf16 distance functions (#6510)

justinrmiller · claude · web-flow · commit d0124edf6257 · 2026-04-16T05:27:54.000-07:00
## Summary - Replaces the external `numkong` dependency with in-tree C kernels for **bf16 distance computation** (dot product, L2, cosine, norm_l2) - Follows the existing f16 kernel pattern: C source compiled via `build.rs` with per-architecture flags, runtime CPU dispatch via `SIMD_SUPPORT` - Kernels are only enabled when the CPU supports the required instructions (NEON on aarch64, AVX2/AVX-512 on x86_64, LSX/LASX on loongarch64), with scalar fallback otherwise - Gated behind the existing `fp16kernels` feature flag ## Benchmark Results Tested on two platforms with 1M x 1024-dim vectors: ### Apple Silicon (M-series, NEON) | Benchmark | Before (scalar) | After (C kernel) | Change | |-----------|-----------------|-------------------|--------| | **Dot(bf16)** | 144 ms | 55 ms | **2.6x faster** | | **NormL2(bf16)** | 90 ms | 36 ms | **2.5x faster** | ### AMD Ryzen 5 4500 (Zen 2, AVX2) | Benchmark | Before (scalar) | After (C kernel) | Change | |-----------|-----------------|-------------------|--------| | **Dot(bf16)** | 578 ms | 363 ms | **1.6x faster** (−37%) | | **NormL2(bf16)** | 365 ms | 207 ms | **1.8x faster** (−43%) | ### Why the approach works BF16-to-f32 conversion is a simple left-shift by 16 bits. The C kernels compiled with architecture-specific flags (`-march=haswell`, `-mtune=apple-m1`, etc.) plus `-ffast-math` and vectorization pragmas give the compiler more freedom to emit tight SIMD code than LLVM gets from the Rust scalar loops. ARM benefits more because the baseline Rust auto-vectorization was weaker there. ## Files Changed - **New**: `rust/lance-linalg/src/simd/bf16.c` — C kernels for dot, L2, cosine, norm_l2 - `rust/lance-linalg/build.rs` — compile bf16.c for each architecture - `rust/lance-linalg/src/distance/{dot,l2,cosine,norm_l2}.rs` — runtime SIMD dispatch for bf16 - `rust/lance-linalg/Cargo.toml` — removed `numkong` dependency and feature - `rust/lance-linalg/benches/{dot,l2,cosine}.rs` — removed numkong benchmark sections - **Deleted**: `scripts/bench_numkong.sh` ## Test plan - [x] `cargo test -p lance-linalg --features fp16kernels` — all bf16 tests pass (kernel path) - [x] `cargo test -p lance-linalg` — all bf16 tests pass (scalar fallback) - [x] `cargo clippy -p lance-linalg --features fp16kernels --tests --benches -- -D warnings` — clean - [x] Benchmarked on Apple Silicon (ARM NEON) - [x] Benchmarked on AMD Ryzen 5 4500 (x86_64 AVX2) - To reproduce: ```bash git checkout HEAD~1 TARGET_TIME=3 cargo bench -p lance-linalg --features fp16kernels --bench dot -- --save-baseline before "bf16" TARGET_TIME=3 cargo bench -p lance-linalg --features fp16kernels --bench norm_l2 -- --save-baseline before "bf16" git checkout - TARGET_TIME=3 cargo bench -p lance-linalg --features fp16kernels --bench dot -- --baseline before "bf16" TARGET_TIME=3 cargo bench -p lance-linalg --features fp16kernels --bench norm_l2 -- --baseline before "bf16" ``` 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/rust/lance-linalg/build.rs b/rust/lance-linalg/build.rs
@@ -19,6 +19,7 @@ fn main() -> Result<(), String> {
     println!("cargo::rustc-check-cfg=cfg(kernel_support, values(\"avx512\"))");
 
     println!("cargo:rerun-if-changed=src/simd/f16.c");
+    println!("cargo:rerun-if-changed=src/simd/bf16.c");
     println!("cargo:rerun-if-changed=src/simd/dist_table.c");
 
     // Important: we don't use `cfg!(target_arch)` here because that is the target_arch
@@ -37,13 +38,16 @@ fn main() -> Result<(), String> {
     if target_arch == "aarch64" && target_os == "macos" {
         // Build a version with NEON
         build_f16_with_flags("neon", &["-mtune=apple-m1"]).unwrap();
+        build_bf16_with_flags("neon", &["-mtune=apple-m1"]).unwrap();
     } else if target_arch == "aarch64" && target_os == "ios" {
         // Build version with NEON
         // A13 bionic is the earliest supported iOS SOC
         build_f16_with_flags("neon", &["-mtune=apple-a13"]).unwrap();
+        build_bf16_with_flags("neon", &["-mtune=apple-a13"]).unwrap();
     } else if target_arch == "aarch64" && (target_os == "linux" || target_os == "android") {
         // Build a version with NEON
         build_f16_with_flags("neon", &["-march=armv8.2-a+fp16"]).unwrap();
+        build_bf16_with_flags("neon", &["-march=armv8.2-a+fp16"]).unwrap();
     } else if target_arch == "x86_64" {
         // Build a version with AVX512
         if let Err(err) = build_f16_with_flags("avx512", &["-march=sapphirerapids", "-mavx512fp16"])
@@ -59,6 +63,17 @@ fn main() -> Result<(), String> {
             // generated the AVX512 version of the f16 kernels.
             println!("cargo:rustc-cfg=kernel_support=\"avx512\"");
         };
+        // Build AVX-512 bf16 kernels (sapphirerapids has native vdpbf16ps)
+        if let Err(err) =
+            build_bf16_with_flags("avx512", &["-march=sapphirerapids", "-mavx512fp16"])
+        {
+            println!(
+                "cargo:warning=Skipping build of AVX-512 bf16 kernels. Error: {}",
+                err
+            );
+        } else {
+            println!("cargo:rustc-cfg=kernel_support=\"avx512\"");
+        };
         if let Err(err) = build_dist_table_with_flags("avx512", &["-march=native"]) {
             println!(
                 "cargo:warning=Skipping build of AVX-512 dist_table. Error: {}",
@@ -77,11 +92,20 @@ fn main() -> Result<(), String> {
                 err
             ));
         };
+        // Build AVX2 bf16 kernels (bf16-to-f32 is just a shift, auto-vectorizes well)
+        if let Err(err) = build_bf16_with_flags("avx2", &["-march=haswell"]) {
+            return Err(format!(
+                "Unable to build AVX2 bf16 kernels.  Received error: {}",
+                err
+            ));
+        };
         // There is no SSE instruction set for f16 -> f32 float conversion
     } else if target_arch == "loongarch64" {
         // Build a version with LSX and LASX
         build_f16_with_flags("lsx", &["-mlsx"]).unwrap();
         build_f16_with_flags("lasx", &["-mlasx"]).unwrap();
+        build_bf16_with_flags("lsx", &["-mlsx"]).unwrap();
+        build_bf16_with_flags("lasx", &["-mlasx"]).unwrap();
     } else {
         // Only error if fp16kernels was explicitly requested on unsupported platform.
         // This allows builds on iOS, Android, etc. when the feature is disabled.
@@ -128,6 +152,32 @@ fn build_f16_with_flags(suffix: &str, flags: &[&str]) -> Result<(), cc::Error> {
     builder.try_compile(&format!("f16_{}", suffix))
 }
 
+fn build_bf16_with_flags(suffix: &str, flags: &[&str]) -> Result<(), cc::Error> {
+    if cfg!(not(feature = "fp16kernels")) {
+        println!(
+            "cargo:warning=fp16kernels feature is not enabled, skipping build of bf16 kernels"
+        );
+        return Ok(());
+    }
+
+    let mut builder = cc::Build::new();
+    builder
+        .std("c17")
+        .file("src/simd/bf16.c")
+        .flag("-ffast-math")
+        .flag("-funroll-loops")
+        .flag("-O3")
+        .flag("-Wall")
+        .flag("-Wextra")
+        .flag(format!("-DSUFFIX=_{}", suffix).as_str());
+
+    for flag in flags {
+        builder.flag(flag);
+    }
+
+    builder.try_compile(&format!("bf16_{}", suffix))
+}
+
 fn build_dist_table_with_flags(suffix: &str, flags: &[&str]) -> Result<(), cc::Error> {
     let mut builder = cc::Build::new();
     builder
diff --git a/rust/lance-linalg/src/distance/cosine.rs b/rust/lance-linalg/src/distance/cosine.rs
@@ -72,7 +72,65 @@ impl Cosine for u8 {
     }
 }
 
-impl Cosine for bf16 {}
+#[cfg(feature = "fp16kernels")]
+mod bf16_kernel {
+    use half::bf16;
+
+    // These are the `cosine_bf16` function in bf16.c. Our build.rs script compiles
+    // a version of this file for each SIMD level with different suffixes.
+    unsafe extern "C" {
+        #[cfg(target_arch = "aarch64")]
+        pub fn cosine_bf16_neon(x: *const bf16, x_norm: f32, y: *const bf16, dimension: u32)
+        -> f32;
+        #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))]
+        pub fn cosine_bf16_avx512(
+            x: *const bf16,
+            x_norm: f32,
+            y: *const bf16,
+            dimension: u32,
+        ) -> f32;
+        #[cfg(target_arch = "x86_64")]
+        pub fn cosine_bf16_avx2(x: *const bf16, x_norm: f32, y: *const bf16, dimension: u32)
+        -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn cosine_bf16_lsx(x: *const bf16, x_norm: f32, y: *const bf16, dimension: u32) -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn cosine_bf16_lasx(x: *const bf16, x_norm: f32, y: *const bf16, dimension: u32)
+        -> f32;
+    }
+}
+
+impl Cosine for bf16 {
+    fn cosine_fast(x: &[Self], x_norm: f32, y: &[Self]) -> f32 {
+        match *SIMD_SUPPORT {
+            #[cfg(all(feature = "fp16kernels", target_arch = "aarch64"))]
+            SimdSupport::Neon => unsafe {
+                bf16_kernel::cosine_bf16_neon(x.as_ptr(), x_norm, y.as_ptr(), y.len() as u32)
+            },
+            #[cfg(all(
+                feature = "fp16kernels",
+                kernel_support = "avx512",
+                target_arch = "x86_64"
+            ))]
+            SimdSupport::Avx512FP16 => unsafe {
+                bf16_kernel::cosine_bf16_avx512(x.as_ptr(), x_norm, y.as_ptr(), y.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "x86_64"))]
+            SimdSupport::Avx2 | SimdSupport::Avx512 => unsafe {
+                bf16_kernel::cosine_bf16_avx2(x.as_ptr(), x_norm, y.as_ptr(), y.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lasx => unsafe {
+                bf16_kernel::cosine_bf16_lasx(x.as_ptr(), x_norm, y.as_ptr(), y.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lsx => unsafe {
+                bf16_kernel::cosine_bf16_lsx(x.as_ptr(), x_norm, y.as_ptr(), y.len() as u32)
+            },
+            _ => cosine_scalar(x, x_norm, y),
+        }
+    }
+}
 
 #[cfg(feature = "fp16kernels")]
 mod kernel {
diff --git a/rust/lance-linalg/src/distance/dot.rs b/rust/lance-linalg/src/distance/dot.rs
@@ -75,10 +75,56 @@ pub trait Dot: Num {
     fn dot(x: &[Self], y: &[Self]) -> f32;
 }
 
+#[cfg(feature = "fp16kernels")]
+mod bf16_kernel {
+    use half::bf16;
+
+    // These are the `dot_bf16` function in bf16.c. Our build.rs script compiles
+    // a version of this file for each SIMD level with different suffixes.
+    unsafe extern "C" {
+        #[cfg(target_arch = "aarch64")]
+        pub fn dot_bf16_neon(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))]
+        pub fn dot_bf16_avx512(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "x86_64")]
+        pub fn dot_bf16_avx2(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn dot_bf16_lsx(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn dot_bf16_lasx(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+    }
+}
+
 impl Dot for bf16 {
     #[inline]
     fn dot(x: &[Self], y: &[Self]) -> f32 {
-        dot_scalar::<Self, f32, 32>(x, y)
+        match *SIMD_SUPPORT {
+            #[cfg(all(feature = "fp16kernels", target_arch = "aarch64"))]
+            SimdSupport::Neon => unsafe {
+                bf16_kernel::dot_bf16_neon(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(
+                feature = "fp16kernels",
+                kernel_support = "avx512",
+                target_arch = "x86_64"
+            ))]
+            SimdSupport::Avx512FP16 => unsafe {
+                bf16_kernel::dot_bf16_avx512(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "x86_64"))]
+            SimdSupport::Avx2 | SimdSupport::Avx512 => unsafe {
+                bf16_kernel::dot_bf16_avx2(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lasx => unsafe {
+                bf16_kernel::dot_bf16_lasx(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lsx => unsafe {
+                bf16_kernel::dot_bf16_lsx(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            _ => dot_scalar::<Self, f32, 32>(x, y),
+        }
     }
 }
 
diff --git a/rust/lance-linalg/src/distance/l2.rs b/rust/lance-linalg/src/distance/l2.rs
@@ -97,11 +97,56 @@ impl L2 for u8 {
     }
 }
 
+#[cfg(feature = "fp16kernels")]
+mod bf16_kernel {
+    use half::bf16;
+
+    // These are the `l2_bf16` function in bf16.c. Our build.rs script compiles
+    // a version of this file for each SIMD level with different suffixes.
+    unsafe extern "C" {
+        #[cfg(target_arch = "aarch64")]
+        pub fn l2_bf16_neon(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))]
+        pub fn l2_bf16_avx512(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "x86_64")]
+        pub fn l2_bf16_avx2(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn l2_bf16_lsx(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn l2_bf16_lasx(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32;
+    }
+}
+
 impl L2 for bf16 {
     #[inline]
     fn l2(x: &[Self], y: &[Self]) -> f32 {
-        // TODO: add SIMD support
-        l2_scalar::<Self, f32, 16>(x, y)
+        match *SIMD_SUPPORT {
+            #[cfg(all(feature = "fp16kernels", target_arch = "aarch64"))]
+            SimdSupport::Neon => unsafe {
+                bf16_kernel::l2_bf16_neon(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(
+                feature = "fp16kernels",
+                kernel_support = "avx512",
+                target_arch = "x86_64"
+            ))]
+            SimdSupport::Avx512FP16 => unsafe {
+                bf16_kernel::l2_bf16_avx512(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "x86_64"))]
+            SimdSupport::Avx2 | SimdSupport::Avx512 => unsafe {
+                bf16_kernel::l2_bf16_avx2(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lasx => unsafe {
+                bf16_kernel::l2_bf16_lasx(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lsx => unsafe {
+                bf16_kernel::l2_bf16_lsx(x.as_ptr(), y.as_ptr(), x.len() as u32)
+            },
+            _ => l2_scalar::<Self, f32, 16>(x, y),
+        }
     }
 }
 
diff --git a/rust/lance-linalg/src/distance/norm_l2.rs b/rust/lance-linalg/src/distance/norm_l2.rs
@@ -80,10 +80,54 @@ impl Normalize for f16 {
     }
 }
 
+#[cfg(feature = "fp16kernels")]
+mod bf16_kernel {
+    use half::bf16;
+
+    unsafe extern "C" {
+        #[cfg(target_arch = "aarch64")]
+        pub fn norm_l2_bf16_neon(ptr: *const bf16, len: u32) -> f32;
+        #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))]
+        pub fn norm_l2_bf16_avx512(ptr: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "x86_64")]
+        pub fn norm_l2_bf16_avx2(ptr: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn norm_l2_bf16_lsx(ptr: *const bf16, len: u32) -> f32;
+        #[cfg(target_arch = "loongarch64")]
+        pub fn norm_l2_bf16_lasx(ptr: *const bf16, len: u32) -> f32;
+    }
+}
+
 impl Normalize for bf16 {
     #[inline]
     fn norm_l2(vector: &[Self]) -> f32 {
-        norm_l2_impl::<Self, f32, 32>(vector)
+        match *SIMD_SUPPORT {
+            #[cfg(all(feature = "fp16kernels", target_arch = "aarch64"))]
+            SimdSupport::Neon => unsafe {
+                bf16_kernel::norm_l2_bf16_neon(vector.as_ptr(), vector.len() as u32)
+            },
+            #[cfg(all(
+                feature = "fp16kernels",
+                kernel_support = "avx512",
+                target_arch = "x86_64"
+            ))]
+            SimdSupport::Avx512FP16 => unsafe {
+                bf16_kernel::norm_l2_bf16_avx512(vector.as_ptr(), vector.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "x86_64"))]
+            SimdSupport::Avx2 | SimdSupport::Avx512 => unsafe {
+                bf16_kernel::norm_l2_bf16_avx2(vector.as_ptr(), vector.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lasx => unsafe {
+                bf16_kernel::norm_l2_bf16_lasx(vector.as_ptr(), vector.len() as u32)
+            },
+            #[cfg(all(feature = "fp16kernels", target_arch = "loongarch64"))]
+            SimdSupport::Lsx => unsafe {
+                bf16_kernel::norm_l2_bf16_lsx(vector.as_ptr(), vector.len() as u32)
+            },
+            _ => norm_l2_impl::<Self, f32, 32>(vector),
+        }
     }
 }
 
diff --git a/rust/lance-linalg/src/simd/bf16.c b/rust/lance-linalg/src/simd/bf16.c
diff --git a/rust/lance-linalg/src/simd/f16.c b/rust/lance-linalg/src/simd/f16.c