refactor(blasgraph): route Hamming/popcount through ndarray::hpc::bitwise

claude · claude · commit 0eaaa86c5209 · 2026-06-14T06:40:34.000Z
Under `ndarray-hpc`, `dispatch_hamming`/`dispatch_popcount` (ndarray_bridge.rs) and the typed `hamming_distance_dispatch` (types.rs) now call `ndarray::hpc::bitwise::{hamming_distance_raw, popcount_raw}` (the canonical VPOPCNTDQ → AVX-512BW → AVX2 → scalar dispatch), per the "all SIMD from ndarray" doctrine. In ndarray_bridge.rs the hand-rolled in-crate intrinsics survive only as the `#[cfg(not(feature = "ndarray-hpc"))]` fallback for minimal / non-x86 builds (CI, wasm, embedded). In types.rs the typed AVX2 and AVX-512 VPOPCNTDQ implementations are removed outright, so without the feature `hamming_distance_dispatch` is scalar-only on every target, x86_64 included. Mirrors the episodic.rs pattern. Validated: `cargo test -p lance-graph --lib blasgraph` → 194 passed, 0 failed (protoc installed to unblock the lance-encoding build script). https://claude.ai/code/session_01D2WSmezQBNC3bUdHuGfGmo
diff --git a/crates/lance-graph/src/graph/blasgraph/ndarray_bridge.rs b/crates/lance-graph/src/graph/blasgraph/ndarray_bridge.rs
@@ -128,60 +128,80 @@ impl From<&ndarray::hpc::fingerprint::Fingerprint<256>> for BitVec {
 }
 
 // ---------------------------------------------------------------------------
-// SIMD dispatch — 4-tier fallback matching ndarray's bitwise.rs pattern
+// SIMD dispatch — routed through ndarray under `ndarray-hpc`, else 4-tier
+// in-crate fallback. Per the "all SIMD from ndarray" doctrine the canonical
+// SIMD dispatch lives in `ndarray::hpc::bitwise`; the hand-rolled intrinsics
+// below survive only as the `#[cfg(not(feature = "ndarray-hpc"))]` fallback
+// for minimal / non-x86 builds (CI, wasm, embedded).
 // ---------------------------------------------------------------------------
 
 /// SIMD-dispatched Hamming distance between two byte slices.
 ///
-/// Computes `popcount(a XOR b)` using the best available instruction set:
-///
-/// 1. **VPOPCNTDQ** (AVX-512 VPOPCNTDQ) — 512-bit popcount in one instruction
-/// 2. **AVX-512BW** — 512-bit XOR + byte-level popcount via shuffle LUT
-/// 3. **AVX2** — 256-bit XOR + byte-level popcount via shuffle LUT
-/// 4. **Scalar** — word-by-word `count_ones()`
+/// Computes `popcount(a XOR b)`. Under `ndarray-hpc` this routes through
+/// `ndarray::hpc::bitwise::hamming_distance_raw`; without the feature it uses
+/// the in-crate fallback. Both paths dispatch VPOPCNTDQ → AVX-512BW → AVX2 →
+/// scalar (the in-crate SIMD tiers are compiled on x86_64 only).
 ///
 /// Both slices must have the same length. Panics otherwise.
 pub fn dispatch_hamming(a: &[u8], b: &[u8]) -> u64 {
     assert_eq!(a.len(), b.len(), "hamming: slices must have equal length");
 
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(feature = "ndarray-hpc")]
     {
-        if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") {
-            // SAFETY: feature detection guarantees VPOPCNTDQ is available.
-            return unsafe { hamming_avx512_vpopcntdq(a, b) };
-        }
-        if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512f") {
-            // SAFETY: feature detection guarantees AVX-512BW is available.
-            return unsafe { hamming_avx512bw(a, b) };
-        }
-        if is_x86_feature_detected!("avx2") {
-            // SAFETY: feature detection guarantees AVX2 is available.
-            return unsafe { hamming_avx2(a, b) };
-        }
+        // Lengths are equal (asserted above), so ndarray's `min(len)` is exact.
+        ndarray::hpc::bitwise::hamming_distance_raw(a, b)
     }
 
-    hamming_scalar(a, b)
+    #[cfg(not(feature = "ndarray-hpc"))]
+    {
+        #[cfg(target_arch = "x86_64")]
+        {
+            if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") {
+                // SAFETY: feature detection guarantees VPOPCNTDQ is available.
+                return unsafe { hamming_avx512_vpopcntdq(a, b) };
+            }
+            if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512f") {
+                // SAFETY: feature detection guarantees AVX-512BW is available.
+                return unsafe { hamming_avx512bw(a, b) };
+            }
+            if is_x86_feature_detected!("avx2") {
+                // SAFETY: feature detection guarantees AVX2 is available.
+                return unsafe { hamming_avx2(a, b) };
+            }
+        }
+
+        hamming_scalar(a, b)
+    }
 }
 
 /// SIMD-dispatched population count over a byte slice.
 ///
-/// Uses the same 4-tier fallback as `dispatch_hamming`:
-/// VPOPCNTDQ -> AVX-512BW -> AVX2 -> scalar.
+/// Under `ndarray-hpc` this routes through
+/// `ndarray::hpc::bitwise::popcount_raw`. Without the feature it uses the
+/// same in-crate 4-tier fallback as `dispatch_hamming` (VPOPCNTDQ → AVX-512BW → AVX2 → scalar).
 pub fn dispatch_popcount(a: &[u8]) -> u64 {
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(feature = "ndarray-hpc")]
     {
-        if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") {
-            return unsafe { popcount_avx512_vpopcntdq(a) };
-        }
-        if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512f") {
-            return unsafe { popcount_avx512bw(a) };
-        }
-        if is_x86_feature_detected!("avx2") {
-            return unsafe { popcount_avx2(a) };
-        }
+        ndarray::hpc::bitwise::popcount_raw(a)
     }
 
-    popcount_scalar(a)
+    #[cfg(not(feature = "ndarray-hpc"))]
+    {
+        #[cfg(target_arch = "x86_64")]
+        {
+            if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") {
+                return unsafe { popcount_avx512_vpopcntdq(a) };
+            }
+            if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512f") {
+                return unsafe { popcount_avx512bw(a) };
+            }
+            if is_x86_feature_detected!("avx2") {
+                return unsafe { popcount_avx2(a) };
+            }
+        }
+
+        popcount_scalar(a)
+    }
 }
 
 // ---------------------------------------------------------------------------
diff --git a/crates/lance-graph/src/graph/blasgraph/types.rs b/crates/lance-graph/src/graph/blasgraph/types.rs
@@ -420,10 +420,16 @@ pub enum SelectOp {
 
 // ─── SIMD-dispatched Hamming distance ─────────────────────────────────
 //
-// Dispatch chain: AVX-512 VPOPCNTDQ → AVX2 → scalar.
-// Uses `std::arch` intrinsics only, no external crate.
+// Per the "all SIMD from ndarray" doctrine, the SIMD dispatch lives in
+// `ndarray::hpc::bitwise` (VPOPCNTDQ → AVX-512BW → AVX2 → scalar) and is
+// routed in under the `ndarray-hpc` feature. The hand-rolled scalar path
+// below is the `#[cfg(not(feature = "ndarray-hpc"))]` fallback so minimal /
+// non-x86 builds (CI, wasm, embedded) keep working without the dep.
 
 /// Scalar fallback: portable popcount via `count_ones()`.
+///
+/// Used as the `#[cfg(not(feature = "ndarray-hpc"))]` Hamming path and by the
+/// in-crate parity tests.
 fn hamming_distance_scalar(a: &[u64; VECTOR_WORDS], b: &[u64; VECTOR_WORDS]) -> u32 {
     let mut dist = 0u32;
     for i in 0..VECTOR_WORDS {
@@ -432,85 +438,31 @@ fn hamming_distance_scalar(a: &[u64; VECTOR_WORDS], b: &[u64; VECTOR_WORDS]) ->
     dist
 }
 
-/// AVX2 implementation: processes 4 × u64 = 256 bits per iteration.
-/// Uses the Harley-Seal popcount algorithm on 256-bit XOR results.
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx2")]
-unsafe fn hamming_distance_avx2(a: &[u64; VECTOR_WORDS], b: &[u64; VECTOR_WORDS]) -> u32 {
-    use std::arch::x86_64::*;
-
-    // Lookup table for 4-bit popcount
-    let lookup = _mm256_setr_epi8(
-        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3,
-        3, 4,
-    );
-    let low_mask = _mm256_set1_epi8(0x0f);
-    let mut total = _mm256_setzero_si256();
-
-    let a_ptr = a.as_ptr() as *const __m256i;
-    let b_ptr = b.as_ptr() as *const __m256i;
-    let n_vecs = VECTOR_WORDS / 4; // 256 / 4 = 64 iterations
-
-    for i in 0..n_vecs {
-        let va = _mm256_loadu_si256(a_ptr.add(i));
-        let vb = _mm256_loadu_si256(b_ptr.add(i));
-        let xor = _mm256_xor_si256(va, vb);
-
-        // Popcount via lookup table (Mula et al.)
-        let lo = _mm256_and_si256(xor, low_mask);
-        let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask);
-        let popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
-        let popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
-        let popcnt = _mm256_add_epi8(popcnt_lo, popcnt_hi);
-
-        // Horizontal sum within bytes → u64 sums via sad
-        let sad = _mm256_sad_epu8(popcnt, _mm256_setzero_si256());
-        total = _mm256_add_epi64(total, sad);
-    }
-
-    // Extract and sum the 4 u64 lanes
-    let lo128 = _mm256_castsi256_si128(total);
-    let hi128 = _mm256_extracti128_si256(total, 1);
-    let sum128 = _mm_add_epi64(lo128, hi128);
-    let upper = _mm_unpackhi_epi64(sum128, sum128);
-    let final_sum = _mm_add_epi64(sum128, upper);
-    _mm_cvtsi128_si64(final_sum) as u32
-}
-
-/// AVX-512 VPOPCNTDQ implementation: processes 8 × u64 = 512 bits per iteration.
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx512f,avx512vpopcntdq")]
-unsafe fn hamming_distance_avx512(a: &[u64; VECTOR_WORDS], b: &[u64; VECTOR_WORDS]) -> u32 {
-    use std::arch::x86_64::*;
-
-    let mut total = _mm512_setzero_si512();
-    let a_ptr = a.as_ptr() as *const __m512i;
-    let b_ptr = b.as_ptr() as *const __m512i;
-    let n_vecs = VECTOR_WORDS / 8; // 256 / 8 = 32 iterations
-
-    for i in 0..n_vecs {
-        let va = _mm512_loadu_si512(a_ptr.add(i));
-        let vb = _mm512_loadu_si512(b_ptr.add(i));
-        let xor = _mm512_xor_si512(va, vb);
-        let popcnt = _mm512_popcnt_epi64(xor);
-        total = _mm512_add_epi64(total, popcnt);
+/// Runtime-dispatched Hamming distance.
+///
+/// Under `ndarray-hpc` this routes through
+/// `ndarray::hpc::bitwise::hamming_distance_raw` (the canonical SIMD dispatch
+/// shared with the rest of the Ada stack); without it, the in-crate scalar path
+/// runs. The `ndarray-hpc` path views each operand's `[u64; VECTOR_WORDS]` as
+/// native-endian bytes; Hamming distance is a bit count, so the byte order does
+/// not matter as long as both operands are viewed the same way.
+fn hamming_distance_dispatch(a: &[u64; VECTOR_WORDS], b: &[u64; VECTOR_WORDS]) -> u32 {
+    #[cfg(feature = "ndarray-hpc")]
+    {
+        const BYTE_LEN: usize = VECTOR_WORDS * 8;
+        // SAFETY: `[u64; VECTOR_WORDS]` is plain-old-data with no padding; a
+        // `&[u8]` view of the same `BYTE_LEN` bytes is always valid (u8 has
+        // alignment 1). Same layout on both operands → bit count is exact.
+        let a_bytes = unsafe { std::slice::from_raw_parts(a.as_ptr() as *const u8, BYTE_LEN) };
+        let b_bytes = unsafe { std::slice::from_raw_parts(b.as_ptr() as *const u8, BYTE_LEN) };
+        // Max distance is VECTOR_BITS (16384), well within u32.
+        ndarray::hpc::bitwise::hamming_distance_raw(a_bytes, b_bytes) as u32
     }
 
-    _mm512_reduce_add_epi64(total) as u32
-}
-
-/// Runtime-dispatched Hamming distance using best available SIMD.
-fn hamming_distance_dispatch(a: &[u64; VECTOR_WORDS], b: &[u64; VECTOR_WORDS]) -> u32 {
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(not(feature = "ndarray-hpc"))]
     {
-        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vpopcntdq") {
-            return unsafe { hamming_distance_avx512(a, b) };
-        }
-        if is_x86_feature_detected!("avx2") {
-            return unsafe { hamming_distance_avx2(a, b) };
-        }
+        hamming_distance_scalar(a, b)
     }
-    hamming_distance_scalar(a, b)
 }
 
 #[cfg(test)]