feat(hpc): SIMD wishlist items #1, #5, #6, #10

claude · claude · commit efacb94108cd · 2026-03-23T11:43:12.000Z
#1 VML wiring: Wire F32x16/F64x8 SIMD types into scalar VML loops - vsln: 16-wide via simd_ln_f32 - vdsqrt: 8-wide via F64x8::sqrt() - vdabs: 8-wide via F64x8::abs() - vssin/vscos: batch load/store via F32x16 (scalar per-lane, SIMD framework) - vspow: 16-wide via exp(b*ln(a)) using simd_exp_f32 + simd_ln_f32 - 7 new tests covering SIMD paths #5 columnar_view: Zero-copy Arrow interop - SoakingBuffer::as_columnar_slice() / as_columnar_slice_mut() - PlaneBuffer::as_binary_slice() - 3 new tests for zero-copy view correctness #6 simd_apply: Generic fused SIMD kernel - simd_apply(a, b, out, Fn(F32x16, F32x16) -> F32x16) - simd_apply_unary(x, out, Fn(F32x16) -> F32x16) - simd_apply_inplace(a, b, Fn(F32x16, F32x16) -> F32x16) - Proper tail handling via zero-padded SIMD - 6 new tests (add, FMA, sqrt, inplace, empty, tail-only) #10 prefetch: Explicit _mm_prefetch in cascade_query - prefetch_t0/t1 wrappers (x86_64 SSE, no-op elsewhere) - Stroke 1: prefetch PREFETCH_DISTANCE=4 candidates ahead (L1) - Stroke 2: prefetch next survivor's data (L2) - Stroke 3: prefetch next survivor's data (L1) All 51 targeted tests pass. Scorecard: 4/10 → 6/10 done. https://claude.ai/code/session_01CdqyUTUfjKZuk8YGJzv6LB
diff --git a/src/hpc/arrow_bridge.rs b/src/hpc/arrow_bridge.rs
@@ -115,6 +115,22 @@ impl SoakingBuffer {
         &mut self.data[start..start + self.n_dims]
     }
 
+    /// Borrows the whole soaking buffer as one flat `&[i8]` slice; nothing is copied.
+    ///
+    /// Layout is row-major, `n_entries × n_dims`: element `(row, col)` sits at
+    /// `row * n_dims + col`. Wrap the slice zero-copy for ndarray with
+    /// `ArrayView2::from_shape((n_entries, n_dims), buf.as_columnar_slice())`.
+    #[inline]
+    pub fn as_columnar_slice(&self) -> &[i8] {
+        &self.data // plain reborrow of the `data` field
+    }
+
+    /// Mutable flat view of the whole soaking buffer; writes land directly in the buffer.
+    #[inline]
+    pub fn as_columnar_slice_mut(&mut self) -> &mut [i8] {
+        &mut self.data // plain mutable reborrow of the `data` field
+    }
+
     /// Crystallize: convert soaking (int8) to binary fingerprint via sign().
     pub fn crystallize(&self, idx: usize) -> Vec<u8> {
         let entry = self.entry(idx);
@@ -162,6 +178,16 @@ impl PlaneBuffer {
         let start = idx * self.binary_bytes;
         &mut self.binary[start..start + self.binary_bytes]
     }
+
+    /// Borrows every binary fingerprint as one flat `&[u8]` slice; nothing is copied.
+    ///
+    /// Layout is row-major, `n_entries × binary_bytes`: entry `idx` starts at
+    /// `idx * binary_bytes`. Wrap the slice zero-copy for ndarray with
+    /// `ArrayView2::from_shape((n_entries, binary_bytes), buf.as_binary_slice())`.
+    #[inline]
+    pub fn as_binary_slice(&self) -> &[u8] {
+        &self.binary // plain reborrow of the `binary` field
+    }
 }
 
 /// Three-plane fingerprint buffer (S, P, O).
@@ -928,4 +954,32 @@ mod tests {
         let dist_raw = hamming_distance_raw(&node.subject_binary, &node.predicate_binary);
         assert_eq!(dist_fp as u64, dist_raw);
     }
+
+    // --- columnar_view tests ---
+
+    #[test]
+    fn soaking_columnar_view_zero_copy() {
+        let mut buf = SoakingBuffer::new(4, 100); // 4 entries × 100 dims
+        buf.entry_mut(2)[50] = 42; // write through the per-entry accessor
+        let slice = buf.as_columnar_slice();
+        // Row-major: (row 2, col 50) lands at flat offset 2*100 + 50 = 250.
+        assert_eq!(slice[250], 42);
+        assert_eq!(slice.len(), 4 * 100); // the view spans every entry
+    }
+
+    #[test]
+    fn soaking_columnar_view_mut() {
+        let mut buf = SoakingBuffer::new(2, 10); // 2 entries × 10 dims
+        buf.as_columnar_slice_mut()[15] = -7; // flat offset 15 = row 1, col 5 (1*10 + 5)
+        assert_eq!(buf.entry(1)[5], -7); // the write is visible via the per-entry accessor
+    }
+
+    #[test]
+    fn plane_buffer_binary_slice() {
+        let mut pb = PlaneBuffer::new(3, BINARY_BYTES); // 3 entries of BINARY_BYTES each
+        pb.binary_entry_mut(1)[0] = 0xAB; // mark the first byte of entry 1
+        let slice = pb.as_binary_slice();
+        assert_eq!(slice.len(), 3 * BINARY_BYTES); // the view spans every entry
+        assert_eq!(slice[BINARY_BYTES], 0xAB); // entry 1 starts at flat offset BINARY_BYTES
+    }
 }
diff --git a/src/hpc/kernels.rs b/src/hpc/kernels.rs
@@ -1000,6 +1000,104 @@ impl BenchmarkTranscript {
     }
 }
 
+// ============================================================================
+// simd_apply — Generic fused SIMD kernel over aligned f32 slices
+// ============================================================================
+
+use crate::simd::F32x16;
+
+/// Apply a generic SIMD operation element-wise over two f32 slices, writing to `out`.
+///
+/// Runs 16 lanes per step via `F32x16`; a short tail is zero-padded to one vector (not scalar).
+/// Only the first `min(a.len(), b.len(), out.len())` elements are touched; `f` is any fused op.
+///
+/// # Examples
+///
+/// ```ignore
+/// use ndarray::hpc::kernels::simd_apply;
+/// let a = vec![1.0f32; 64];
+/// let b = vec![2.0f32; 64];
+/// let mut out = vec![0.0f32; 64];
+/// // Fused multiply-add: a * b + a
+/// simd_apply(&a, &b, &mut out, |va, vb| va.mul_add(vb, va));
+/// ```
+#[inline]
+pub fn simd_apply<F>(a: &[f32], b: &[f32], out: &mut [f32], f: F)
+where
+    F: Fn(F32x16, F32x16) -> F32x16,
+{
+    let n = a.len().min(b.len()).min(out.len()); // shortest slice bounds the work
+    let mut i = 0;
+    while i + 16 <= n {
+        let va = F32x16::from_slice(&a[i..]);
+        let vb = F32x16::from_slice(&b[i..]);
+        f(va, vb).copy_to_slice(&mut out[i..]);
+        i += 16;
+    }
+    // Tail (<16 left): zero-pad both inputs, run `f` once, keep only the first `tail_len` lanes.
+    if i < n {
+        let tail_len = n - i;
+        let mut a_pad = [0.0f32; 16];
+        let mut b_pad = [0.0f32; 16];
+        a_pad[..tail_len].copy_from_slice(&a[i..n]);
+        b_pad[..tail_len].copy_from_slice(&b[i..n]);
+        let result = f(F32x16::from_array(a_pad), F32x16::from_array(b_pad)); // pad lanes see 0.0
+        let arr = result.to_array();
+        out[i..n].copy_from_slice(&arr[..tail_len]); // results of the padding lanes are dropped
+    }
+}
+
+/// Apply a generic unary SIMD operation element-wise over an f32 slice, writing to `out`.
+///
+/// Single-input variant of [`simd_apply`]; processes `min(x.len(), out.len())` elements.
+#[inline]
+pub fn simd_apply_unary<F>(x: &[f32], out: &mut [f32], f: F)
+where
+    F: Fn(F32x16) -> F32x16,
+{
+    let n = x.len().min(out.len()); // shorter slice bounds the work
+    let mut i = 0;
+    while i + 16 <= n {
+        let v = F32x16::from_slice(&x[i..]);
+        f(v).copy_to_slice(&mut out[i..]);
+        i += 16;
+    }
+    if i < n {
+        let tail_len = n - i; // fewer than 16 elements remain
+        let mut pad = [0.0f32; 16];
+        pad[..tail_len].copy_from_slice(&x[i..n]);
+        let result = f(F32x16::from_array(pad)); // padding lanes see 0.0
+        let arr = result.to_array();
+        out[i..n].copy_from_slice(&arr[..tail_len]); // results of the padding lanes are dropped
+    }
+}
+
+/// In-place binary SIMD op over the first `min(a.len(), b.len())` items: `a[i] = f(a[i], b[i])`.
+#[inline]
+pub fn simd_apply_inplace<F>(a: &mut [f32], b: &[f32], f: F)
+where
+    F: Fn(F32x16, F32x16) -> F32x16,
+{
+    let n = a.len().min(b.len()); // shorter slice bounds the work
+    let mut i = 0;
+    while i + 16 <= n {
+        let va = F32x16::from_slice(&a[i..]);
+        let vb = F32x16::from_slice(&b[i..]);
+        f(va, vb).copy_to_slice(&mut a[i..]); // overwrite the chunk that was just loaded
+        i += 16;
+    }
+    if i < n {
+        let tail_len = n - i; // fewer than 16 elements remain
+        let mut a_pad = [0.0f32; 16];
+        let mut b_pad = [0.0f32; 16];
+        a_pad[..tail_len].copy_from_slice(&a[i..n]);
+        b_pad[..tail_len].copy_from_slice(&b[i..n]);
+        let result = f(F32x16::from_array(a_pad), F32x16::from_array(b_pad)); // pad lanes see 0.0
+        let arr = result.to_array();
+        a[i..n].copy_from_slice(&arr[..tail_len]); // results of the padding lanes are dropped
+    }
+}
+
 // ============================================================================
 // Tests
 // ============================================================================
@@ -1586,4 +1684,68 @@ mod tests {
         assert_eq!(exact.sigma.level, SignificanceLevel::Discovery);
         assert!(exact.sigma.sigma > 100.0);
     }
+
+    // --- simd_apply tests ---
+
+    #[test]
+    fn test_simd_apply_add() {
+        let a: Vec<f32> = (0..100).map(|i| i as f32).collect(); // 100 = 6×16 + 4: both paths run
+        let b: Vec<f32> = (0..100).map(|i| (i * 2) as f32).collect();
+        let mut out = vec![0.0f32; 100];
+        simd_apply(&a, &b, &mut out, |va, vb| va + vb);
+        for i in 0..100 {
+            assert_eq!(out[i], (i + i * 2) as f32, "mismatch at {i}"); // expect a[i] + b[i]
+        }
+    }
+
+    #[test]
+    fn test_simd_apply_fma() {
+        let a = vec![2.0f32; 35]; // 35 = 2×16 + 3, so the zero-padded tail path runs too
+        let b = vec![3.0f32; 35];
+        let mut out = vec![0.0f32; 35];
+        // Every element: a * b + a = 2*3 + 2 = 8
+        simd_apply(&a, &b, &mut out, |va, vb| va.mul_add(vb, va));
+        for i in 0..35 {
+            assert!((out[i] - 8.0).abs() < 1e-5, "mismatch at {i}: {}", out[i]);
+        }
+    }
+
+    #[test]
+    fn test_simd_apply_unary_sqrt() {
+        let x: Vec<f32> = (1..=50).map(|i| (i * i) as f32).collect(); // perfect squares 1..=2500
+        let mut out = vec![0.0f32; 50]; // 50 = 3×16 + 2: full chunks plus a tail
+        simd_apply_unary(&x, &mut out, |v| v.sqrt());
+        for i in 0..50 {
+            assert!((out[i] - (i + 1) as f32).abs() < 1e-4, "mismatch at {i}"); // sqrt((i+1)²)
+        }
+    }
+
+    #[test]
+    fn test_simd_apply_inplace() {
+        let mut a: Vec<f32> = (0..48).map(|i| i as f32).collect(); // 48 = 3×16: no tail here
+        let b = vec![1.0f32; 48];
+        simd_apply_inplace(&mut a, &b, |va, vb| va + vb);
+        for i in 0..48 {
+            assert_eq!(a[i], (i + 1) as f32); // each element was incremented in place
+        }
+    }
+
+    #[test]
+    fn test_simd_apply_empty() {
+        let a: Vec<f32> = vec![]; // n == 0: neither the chunk loop nor the tail branch runs
+        let b: Vec<f32> = vec![];
+        let mut out: Vec<f32> = vec![];
+        simd_apply(&a, &b, &mut out, |va, vb| va + vb);
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn test_simd_apply_small_tail_only() {
+        // Only 3 elements (< 16): the chunk loop is skipped and the padded tail does all the work
+        let a = vec![1.0f32, 2.0, 3.0];
+        let b = vec![4.0f32, 5.0, 6.0];
+        let mut out = vec![0.0f32; 3];
+        simd_apply(&a, &b, &mut out, |va, vb| va * vb);
+        assert_eq!(out, [4.0, 10.0, 18.0]); // element-wise products
+    }
 }
diff --git a/src/hpc/packed.rs b/src/hpc/packed.rs
@@ -19,6 +19,44 @@
 
 use super::bitwise;
 
+/// Software prefetch: hint the CPU to pull the cache line containing `ptr` into L1.
+///
+/// Compiles to nothing unless the target is x86_64 with SSE enabled; there it issues
+/// `_mm_prefetch(_MM_HINT_T0)`. The prefetch distance (how many candidates ahead)
+/// should be tuned per cache hierarchy — 4 candidates × 128B = 512B ≈ 8 cache lines
+/// (at 64 B per line) is a reasonable default for the Stroke 1 sequential scan.
+#[inline(always)]
+#[allow(unused_variables)] // `ptr` is unused when the cfg-gated body is compiled out
+fn prefetch_t0(ptr: *const u8) {
+    #[cfg(target_arch = "x86_64")]
+    unsafe { // SAFETY: prefetch is a hint only; per Intel's PREFETCHh docs a bad address cannot fault
+        #[cfg(target_feature = "sse")]
+        {
+            core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T0 }>(
+                ptr as *const i8,
+            );
+        }
+    }
+}
+
+/// Prefetch toward L2 via `_MM_HINT_T1` (a temporal hint; `_MM_HINT_NTA` is the non-temporal one).
+#[inline(always)]
+#[allow(unused_variables)] // `ptr` is unused when the cfg-gated body is compiled out
+fn prefetch_t1(ptr: *const u8) {
+    #[cfg(target_arch = "x86_64")]
+    unsafe { // SAFETY: prefetch is a hint only; per Intel's PREFETCHh docs a bad address cannot fault
+        #[cfg(target_feature = "sse")]
+        {
+            core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T1 }>(
+                ptr as *const i8,
+            );
+        }
+    }
+}
+
+/// How many candidates ahead of the current one the Stroke 1 scan prefetches.
+const PREFETCH_DISTANCE: usize = 4;
+
 /// Stroke 1: first 128 bytes (1024 bits) — coarse rejection (~90% eliminated).
 pub const STROKE1_BYTES: usize = 128;
 /// Stroke 2: bytes 128..512 (3072 bits) — medium filter (~90% of survivors).
@@ -161,17 +199,28 @@ impl PackedDatabase {
         let query_s3 = &query[STROKE1_BYTES + STROKE2_BYTES..FINGERPRINT_BYTES];
 
         // STROKE 1: coarse rejection — sequential scan through packed stroke1
+        // Prefetch PREFETCH_DISTANCE candidates ahead to hide memory latency.
         let mut survivors: Vec<(usize, u64)> = Vec::with_capacity(self.num_candidates / 10);
         for i in 0..self.num_candidates {
+            // Prefetch stroke1 data for upcoming candidate
+            if i + PREFETCH_DISTANCE < self.num_candidates {
+                prefetch_t0(self.stroke1[(i + PREFETCH_DISTANCE) * STROKE1_BYTES..].as_ptr());
+            }
             let d1 = bitwise::hamming_distance_raw(query_s1, self.get_stroke1(i));
             if d1 <= reject_threshold_s1 {
                 survivors.push((i, d1));
             }
         }
 
         // STROKE 2: medium filter — scan survivors through packed stroke2
+        // Prefetch stroke2 data for next survivor (sparse access pattern).
         let mut survivors2: Vec<(usize, u64)> = Vec::with_capacity(survivors.len() / 10);
-        for &(idx, d1) in &survivors {
+        for (si, &(idx, d1)) in survivors.iter().enumerate() {
+            // Prefetch next survivor's stroke2 data into L2
+            if si + 1 < survivors.len() {
+                let next_idx = survivors[si + 1].0;
+                prefetch_t1(self.stroke2[next_idx * STROKE2_BYTES..].as_ptr());
+            }
             let d2 = bitwise::hamming_distance_raw(query_s2, self.get_stroke2(idx));
             let d_cumul = d1 + d2;
             if d_cumul <= reject_threshold_s12 {
@@ -180,8 +229,13 @@ impl PackedDatabase {
         }
 
         // STROKE 3: precise distance — final ranking
+        // Prefetch next survivor's stroke3 data.
         let mut results: Vec<RankedHit> = Vec::with_capacity(survivors2.len());
-        for &(idx, d12) in &survivors2 {
+        for (si, &(idx, d12)) in survivors2.iter().enumerate() {
+            if si + 1 < survivors2.len() {
+                let next_idx = survivors2[si + 1].0;
+                prefetch_t0(self.stroke3[next_idx * STROKE3_BYTES..].as_ptr());
+            }
             let d3 = bitwise::hamming_distance_raw(query_s3, self.get_stroke3(idx));
             results.push(RankedHit {
                 index: self.original_id(idx) as usize,
diff --git a/src/hpc/vml.rs b/src/hpc/vml.rs