Skip to content

Commit efacb94

Browse files
committed
feat(hpc): SIMD wishlist items #1, #5, #6, #10
#1 VML wiring: Wire F32x16/F64x8 SIMD types into scalar VML loops
- vsln: 16-wide via simd_ln_f32
- vdsqrt: 8-wide via F64x8::sqrt()
- vdabs: 8-wide via F64x8::abs()
- vssin/vscos: batch load/store via F32x16 (scalar per-lane, SIMD framework)
- vspow: 16-wide via exp(b*ln(a)) using simd_exp_f32 + simd_ln_f32
- 7 new tests covering SIMD paths

#5 columnar_view: Zero-copy Arrow interop
- SoakingBuffer::as_columnar_slice() / as_columnar_slice_mut()
- PlaneBuffer::as_binary_slice()
- 3 new tests for zero-copy view correctness

#6 simd_apply: Generic fused SIMD kernel
- simd_apply(a, b, out, Fn(F32x16, F32x16) -> F32x16)
- simd_apply_unary(x, out, Fn(F32x16) -> F32x16)
- simd_apply_inplace(a, b, Fn(F32x16, F32x16) -> F32x16)
- Proper tail handling via zero-padded SIMD
- 6 new tests (add, FMA, sqrt, inplace, empty, tail-only)

#10 prefetch: Explicit _mm_prefetch in cascade_query
- prefetch_t0/t1 wrappers (x86_64 SSE, no-op elsewhere)
- Stroke 1: prefetch PREFETCH_DISTANCE=4 candidates ahead (L1)
- Stroke 2: prefetch next survivor's data (L2)
- Stroke 3: prefetch next survivor's data (L1)

All 51 targeted tests pass. Scorecard: 4/10 → 6/10 done.

https://claude.ai/code/session_01CdqyUTUfjKZuk8YGJzv6LB
1 parent 85f5fb4 commit efacb94

4 files changed

Lines changed: 434 additions & 15 deletions

File tree

src/hpc/arrow_bridge.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,22 @@ impl SoakingBuffer {
115115
&mut self.data[start..start + self.n_dims]
116116
}
117117

118+
/// Zero-copy columnar view of the entire soaking buffer as a flat `&[i8]` slice.
119+
///
/// The returned slice borrows `self.data` directly; nothing is allocated or copied.
///
120+
/// Layout: `n_entries × n_dims` row-major (each entry's `n_dims` values are
/// contiguous, so despite the "columnar" name there is one row per entry). Use with
121+
/// `ArrayView2::from_shape((n_entries, n_dims), buf.as_columnar_slice())`
122+
/// for zero-copy ndarray interop (`from_shape` returns a `Result`).
123+
#[inline]
124+
pub fn as_columnar_slice(&self) -> &[i8] {
125+
&self.data
126+
}
127+
128+
/// Mutable zero-copy view of the entire soaking buffer as a flat `&mut [i8]` slice.
///
/// The slice is `n_entries × n_dims` row-major, so element `(row, col)` lives at
/// index `row * n_dims + col`. Writes go straight into the buffer's storage and
/// are visible through `entry()` afterwards.
129+
#[inline]
130+
pub fn as_columnar_slice_mut(&mut self) -> &mut [i8] {
131+
&mut self.data
132+
}
133+
118134
/// Crystallize: convert soaking (int8) to binary fingerprint via sign().
119135
pub fn crystallize(&self, idx: usize) -> Vec<u8> {
120136
let entry = self.entry(idx);
@@ -162,6 +178,16 @@ impl PlaneBuffer {
162178
let start = idx * self.binary_bytes;
163179
&mut self.binary[start..start + self.binary_bytes]
164180
}
181+
182+
/// Zero-copy columnar view of all binary fingerprints as a flat `&[u8]` slice.
183+
///
/// The returned slice borrows `self.binary` directly; nothing is allocated or copied.
/// Entry `i` occupies bytes `i * binary_bytes .. (i + 1) * binary_bytes`.
///
184+
/// Layout: `n_entries × binary_bytes` row-major. Use with
185+
/// `ArrayView2::from_shape((n_entries, binary_bytes), buf.as_binary_slice())`
186+
/// for zero-copy ndarray interop (`from_shape` returns a `Result`).
187+
#[inline]
188+
pub fn as_binary_slice(&self) -> &[u8] {
189+
&self.binary
190+
}
165191
}
166192

167193
/// Three-plane fingerprint buffer (S, P, O).
@@ -928,4 +954,32 @@ mod tests {
928954
let dist_raw = hamming_distance_raw(&node.subject_binary, &node.predicate_binary);
929955
assert_eq!(dist_fp as u64, dist_raw);
930956
}
957+
958+
// --- columnar_view tests ---
959+
960+
// A value written through `entry_mut` must show up at the matching row-major
// offset of the flat view, and the view must span every entry.
// NOTE(review): this checks layout only; it does not assert pointer identity,
// so it would still pass if the view were a copy.
#[test]
961+
fn soaking_columnar_view_zero_copy() {
962+
let mut buf = SoakingBuffer::new(4, 100);
963+
buf.entry_mut(2)[50] = 42;
964+
let slice = buf.as_columnar_slice();
965+
// Row 2, col 50 → offset 2*100 + 50 = 250
966+
assert_eq!(slice[250], 42);
967+
assert_eq!(slice.len(), 4 * 100);
968+
}
969+
970+
// A write through the flat mutable view must be readable through the
// per-entry accessor.
#[test]
971+
fn soaking_columnar_view_mut() {
972+
let mut buf = SoakingBuffer::new(2, 10);
973+
buf.as_columnar_slice_mut()[15] = -7; // Row 1, col 5 → offset 1*10 + 5 = 15
974+
assert_eq!(buf.entry(1)[5], -7);
975+
}
976+
977+
// The flat binary view must span all entries and place entry 1 immediately
// after entry 0.
#[test]
978+
fn plane_buffer_binary_slice() {
979+
let mut pb = PlaneBuffer::new(3, BINARY_BYTES);
980+
pb.binary_entry_mut(1)[0] = 0xAB;
981+
let slice = pb.as_binary_slice();
982+
assert_eq!(slice.len(), 3 * BINARY_BYTES);
// First byte of entry 1 sits at offset 1 * BINARY_BYTES in the flat view.
983+
assert_eq!(slice[BINARY_BYTES], 0xAB);
984+
}
931985
}

src/hpc/kernels.rs

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,104 @@ impl BenchmarkTranscript {
10001000
}
10011001
}
10021002

1003+
// ============================================================================
1004+
// simd_apply — Generic fused SIMD kernel over aligned f32 slices
1005+
// ============================================================================
1006+
1007+
use crate::simd::F32x16;
1008+
1009+
/// Apply a generic SIMD operation element-wise over two f32 slices.
1010+
///
1011+
/// Processes 16 elements per iteration using `F32x16`. A final partial chunk is
/// handled by zero-padding both inputs to 16 lanes, calling `f` once more, and
/// copying out only the valid lanes (there is no per-element scalar loop).
1012+
/// This is the generic fusion primitive: callers pass any `Fn(F32x16, F32x16) -> F32x16`.
1013+
///
/// Only the first `min(a.len(), b.len(), out.len())` elements are processed;
/// any further elements of `out` are left untouched. Mismatched lengths are
/// truncated silently rather than reported.
///
/// `f` should be lane-wise: in the tail call the unused lanes of both inputs
/// hold `0.0`, and whatever `f` produces for those lanes is discarded.
///
/// NOTE(review): the summary previously said "aligned" slices. Nothing in this
/// function checks memory alignment; confirm what `F32x16::from_slice` and
/// `copy_to_slice` require.
///
1014+
/// # Examples
1015+
///
1016+
/// ```ignore
1017+
/// use ndarray::hpc::kernels::simd_apply;
1018+
/// let a = vec![1.0f32; 64];
1019+
/// let b = vec![2.0f32; 64];
1020+
/// let mut out = vec![0.0f32; 64];
1021+
/// // Fused multiply-add: a * b + a
1022+
/// simd_apply(&a, &b, &mut out, |va, vb| va.mul_add(vb, va));
1023+
/// ```
1024+
#[inline]
1025+
pub fn simd_apply<F>(a: &[f32], b: &[f32], out: &mut [f32], f: F)
1026+
where
1027+
F: Fn(F32x16, F32x16) -> F32x16,
1028+
{
1029+
let n = a.len().min(b.len()).min(out.len());
1030+
let mut i = 0;
1031+
while i + 16 <= n {
1032+
let va = F32x16::from_slice(&a[i..]);
1033+
let vb = F32x16::from_slice(&b[i..]);
1034+
f(va, vb).copy_to_slice(&mut out[i..]);
1035+
i += 16;
1036+
}
1037+
// Tail (fewer than 16 elements left): zero-pad into full vectors, apply `f` once, keep the valid lanes
1038+
if i < n {
1039+
let tail_len = n - i;
1040+
let mut a_pad = [0.0f32; 16];
1041+
let mut b_pad = [0.0f32; 16];
1042+
a_pad[..tail_len].copy_from_slice(&a[i..n]);
1043+
b_pad[..tail_len].copy_from_slice(&b[i..n]);
1044+
let result = f(F32x16::from_array(a_pad), F32x16::from_array(b_pad));
1045+
let arr = result.to_array();
1046+
out[i..n].copy_from_slice(&arr[..tail_len]);
1047+
}
1048+
}
1049+
1050+
/// Apply a generic unary SIMD operation element-wise over an f32 slice.
1051+
///
1052+
/// Single-input variant of [`simd_apply`].
///
/// Only the first `min(x.len(), out.len())` elements are processed; any further
/// elements of `out` are left untouched. Full 16-lane chunks are loaded straight
/// from `x`; a final partial chunk is zero-padded to 16 lanes, passed through `f`
/// once, and only the valid lanes are stored. `f` therefore sees `0.0` in the
/// unused tail lanes, and its results for those lanes are discarded.
1053+
#[inline]
1054+
pub fn simd_apply_unary<F>(x: &[f32], out: &mut [f32], f: F)
1055+
where
1056+
F: Fn(F32x16) -> F32x16,
1057+
{
1058+
let n = x.len().min(out.len());
1059+
let mut i = 0;
1060+
while i + 16 <= n {
1061+
let v = F32x16::from_slice(&x[i..]);
1062+
f(v).copy_to_slice(&mut out[i..]);
1063+
i += 16;
1064+
}
1065+
if i < n {
// Tail: zero-pad the remaining elements into one full vector.
1066+
let tail_len = n - i;
1067+
let mut pad = [0.0f32; 16];
1068+
pad[..tail_len].copy_from_slice(&x[i..n]);
1069+
let result = f(F32x16::from_array(pad));
1070+
let arr = result.to_array();
// Store only the lanes that correspond to real input.
1071+
out[i..n].copy_from_slice(&arr[..tail_len]);
1072+
}
1073+
}
1074+
1075+
/// Apply a generic SIMD operation in-place: `a[i] = f(a[i], b[i])`.
///
/// Only the first `min(a.len(), b.len())` elements of `a` are updated; any
/// further elements of `a` keep their previous values. Each 16-lane chunk of `a`
/// is loaded before it is overwritten. A final partial chunk is zero-padded to
/// 16 lanes, passed through `f` once, and only the valid lanes are written back,
/// so `f` sees `0.0` in the unused tail lanes.
1076+
#[inline]
1077+
pub fn simd_apply_inplace<F>(a: &mut [f32], b: &[f32], f: F)
1078+
where
1079+
F: Fn(F32x16, F32x16) -> F32x16,
1080+
{
1081+
let n = a.len().min(b.len());
1082+
let mut i = 0;
1083+
while i + 16 <= n {
1084+
let va = F32x16::from_slice(&a[i..]);
1085+
let vb = F32x16::from_slice(&b[i..]);
1086+
f(va, vb).copy_to_slice(&mut a[i..]);
1087+
i += 16;
1088+
}
1089+
if i < n {
// Tail: zero-pad both operands into full vectors.
1090+
let tail_len = n - i;
1091+
let mut a_pad = [0.0f32; 16];
1092+
let mut b_pad = [0.0f32; 16];
1093+
a_pad[..tail_len].copy_from_slice(&a[i..n]);
1094+
b_pad[..tail_len].copy_from_slice(&b[i..n]);
1095+
let result = f(F32x16::from_array(a_pad), F32x16::from_array(b_pad));
1096+
let arr = result.to_array();
// Write back only the lanes that correspond to real input.
1097+
a[i..n].copy_from_slice(&arr[..tail_len]);
1098+
}
1099+
}
1100+
10031101
// ============================================================================
10041102
// Tests
10051103
// ============================================================================
@@ -1586,4 +1684,68 @@ mod tests {
15861684
assert_eq!(exact.sigma.level, SignificanceLevel::Discovery);
15871685
assert!(exact.sigma.sigma > 100.0);
15881686
}
1687+
1688+
// --- simd_apply tests ---
1689+
1690+
// 100 elements = 6 full 16-lane chunks + a 4-element tail, so both the chunked
// loop and the padded tail path of `simd_apply` run.
#[test]
1691+
fn test_simd_apply_add() {
1692+
let a: Vec<f32> = (0..100).map(|i| i as f32).collect();
1693+
let b: Vec<f32> = (0..100).map(|i| (i * 2) as f32).collect();
1694+
let mut out = vec![0.0f32; 100];
1695+
simd_apply(&a, &b, &mut out, |va, vb| va + vb);
1696+
for i in 0..100 {
// Exact comparison is fine here: all values are small integers.
1697+
assert_eq!(out[i], (i + i * 2) as f32, "mismatch at {i}");
1698+
}
1699+
}
1700+
1701+
// Fused multiply-add through `simd_apply` on a length that leaves a tail
// (35 = 2 full chunks + 3 elements).
#[test]
1702+
fn test_simd_apply_fma() {
1703+
let a = vec![2.0f32; 35]; // Not divisible by 16 — tests tail
1704+
let b = vec![3.0f32; 35];
1705+
let mut out = vec![0.0f32; 35];
1706+
// a * b + a = 2*3 + 2 = 8
1707+
simd_apply(&a, &b, &mut out, |va, vb| va.mul_add(vb, va));
1708+
for i in 0..35 {
1709+
assert!((out[i] - 8.0).abs() < 1e-5, "mismatch at {i}: {}", out[i]);
1710+
}
1711+
}
1712+
1713+
// Square roots of the perfect squares 1², 2², …, 50² via `simd_apply_unary`
// (50 = 3 full chunks + a 2-element tail).
#[test]
1714+
fn test_simd_apply_unary_sqrt() {
1715+
let x: Vec<f32> = (1..=50).map(|i| (i * i) as f32).collect();
1716+
let mut out = vec![0.0f32; 50];
1717+
simd_apply_unary(&x, &mut out, |v| v.sqrt());
1718+
for i in 0..50 {
// x[i] = (i + 1)², so the expected root is i + 1.
1719+
assert!((out[i] - (i + 1) as f32).abs() < 1e-4, "mismatch at {i}");
1720+
}
1721+
}
1722+
1723+
// In-place add of 1.0 to every element.
// NOTE(review): 48 is an exact multiple of 16, so only the full-chunk loop of
// `simd_apply_inplace` runs; its padded tail path is not exercised by this test.
#[test]
1724+
fn test_simd_apply_inplace() {
1725+
let mut a: Vec<f32> = (0..48).map(|i| i as f32).collect();
1726+
let b = vec![1.0f32; 48];
1727+
simd_apply_inplace(&mut a, &b, |va, vb| va + vb);
1728+
for i in 0..48 {
1729+
assert_eq!(a[i], (i + 1) as f32);
1730+
}
1731+
}
1732+
1733+
// Empty inputs: neither the chunk loop nor the tail branch runs, and the call
// must not panic.
#[test]
1734+
fn test_simd_apply_empty() {
1735+
let a: Vec<f32> = vec![];
1736+
let b: Vec<f32> = vec![];
1737+
let mut out: Vec<f32> = vec![];
1738+
simd_apply(&a, &b, &mut out, |va, vb| va + vb);
1739+
assert!(out.is_empty());
1740+
}
1741+
1742+
// Inputs shorter than one vector: the result comes entirely from the
// zero-padded tail path.
#[test]
1743+
fn test_simd_apply_small_tail_only() {
1744+
// Only 3 elements — entirely tail path
1745+
let a = vec![1.0f32, 2.0, 3.0];
1746+
let b = vec![4.0f32, 5.0, 6.0];
1747+
let mut out = vec![0.0f32; 3];
1748+
simd_apply(&a, &b, &mut out, |va, vb| va * vb);
// Lane-wise products: 1*4, 2*5, 3*6.
1749+
assert_eq!(out, [4.0, 10.0, 18.0]);
1750+
}
15891751
}

src/hpc/packed.rs

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,44 @@
1919
2020
use super::bitwise;
2121

22+
/// Software prefetch: request the cache line containing `ptr` with the T0 hint
/// (`_MM_HINT_T0`, i.e. into the cache levels closest to the core, including L1).
23+
///
24+
/// No-op on non-x86_64 targets, and on x86_64 builds without the `sse` target
/// feature. On x86_64, uses `_mm_prefetch(_MM_HINT_T0)`.
25+
/// The prefetch distance (how many candidates ahead) should be tuned per
26+
/// cache hierarchy — 4 candidates × 128B = 512B ≈ 8 cache lines (assuming
/// 64-byte lines) is a
27+
/// reasonable default for Stroke 1 sequential scan.
///
/// NOTE(review): one call covers a single cache line. Call sites pass only the
/// start of each record; with 64-byte lines a 128-byte stroke spans two lines,
/// so confirm whether a second prefetch per record is wanted.
28+
#[inline(always)]
// `ptr` is unused when the intrinsic is compiled out by the cfg gates below.
29+
#[allow(unused_variables)]
30+
fn prefetch_t0(ptr: *const u8) {
31+
#[cfg(target_arch = "x86_64")]
32+
unsafe {
// SAFETY: the intrinsic is gated on the `sse` target feature, so it is only
// compiled where available. Per the PREFETCHh documentation the instruction
// is a hint that does not fault on the address it is given.
// NOTE(review): confirm against the Intel SDM.
33+
#[cfg(target_feature = "sse")]
34+
{
35+
core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T0 }>(
36+
ptr as *const i8,
37+
);
38+
}
39+
}
40+
}
41+
42+
/// Software prefetch: request the cache line containing `ptr` with the T1 hint
/// (`_MM_HINT_T1`, i.e. into L2 and outer cache levels rather than L1).
///
/// T1 is a temporal locality hint. The non-temporal hint is `_MM_HINT_NTA`,
/// which this function does not use.
///
/// No-op on non-x86_64 targets, and on x86_64 builds without the `sse` target
/// feature.
43+
#[inline(always)]
// `ptr` is unused when the intrinsic is compiled out by the cfg gates below.
44+
#[allow(unused_variables)]
45+
fn prefetch_t1(ptr: *const u8) {
46+
#[cfg(target_arch = "x86_64")]
47+
unsafe {
// SAFETY: the intrinsic is gated on the `sse` target feature, so it is only
// compiled where available. Per the PREFETCHh documentation the instruction
// is a hint that does not fault on the address it is given.
// NOTE(review): confirm against the Intel SDM.
48+
#[cfg(target_feature = "sse")]
49+
{
50+
core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T1 }>(
51+
ptr as *const i8,
52+
);
53+
}
54+
}
55+
}
56+
57+
/// Number of candidates to prefetch ahead in the Stroke 1 scan.
///
/// With `STROKE1_BYTES = 128` this puts the prefetched address 4 × 128 = 512
/// bytes ahead of the candidate currently being compared.
58+
const PREFETCH_DISTANCE: usize = 4;
59+
2260
/// Stroke 1: first 128 bytes (1024 bits) — coarse rejection (~90% eliminated).
2361
pub const STROKE1_BYTES: usize = 128;
2462
/// Stroke 2: bytes 128..512 (3072 bits) — medium filter (~90% of survivors).
@@ -161,17 +199,28 @@ impl PackedDatabase {
161199
let query_s3 = &query[STROKE1_BYTES + STROKE2_BYTES..FINGERPRINT_BYTES];
162200

163201
// STROKE 1: coarse rejection — sequential scan through packed stroke1
202+
// Prefetch PREFETCH_DISTANCE candidates ahead to hide memory latency.
164203
let mut survivors: Vec<(usize, u64)> = Vec::with_capacity(self.num_candidates / 10);
165204
for i in 0..self.num_candidates {
205+
// Prefetch stroke1 data for upcoming candidate
206+
if i + PREFETCH_DISTANCE < self.num_candidates {
207+
prefetch_t0(self.stroke1[(i + PREFETCH_DISTANCE) * STROKE1_BYTES..].as_ptr());
208+
}
166209
let d1 = bitwise::hamming_distance_raw(query_s1, self.get_stroke1(i));
167210
if d1 <= reject_threshold_s1 {
168211
survivors.push((i, d1));
169212
}
170213
}
171214

172215
// STROKE 2: medium filter — scan survivors through packed stroke2
216+
// Prefetch stroke2 data for next survivor (sparse access pattern).
173217
let mut survivors2: Vec<(usize, u64)> = Vec::with_capacity(survivors.len() / 10);
174-
for &(idx, d1) in &survivors {
218+
for (si, &(idx, d1)) in survivors.iter().enumerate() {
219+
// Prefetch next survivor's stroke2 data into L2
220+
if si + 1 < survivors.len() {
221+
let next_idx = survivors[si + 1].0;
222+
prefetch_t1(self.stroke2[next_idx * STROKE2_BYTES..].as_ptr());
223+
}
175224
let d2 = bitwise::hamming_distance_raw(query_s2, self.get_stroke2(idx));
176225
let d_cumul = d1 + d2;
177226
if d_cumul <= reject_threshold_s12 {
@@ -180,8 +229,13 @@ impl PackedDatabase {
180229
}
181230

182231
// STROKE 3: precise distance — final ranking
232+
// Prefetch next survivor's stroke3 data.
183233
let mut results: Vec<RankedHit> = Vec::with_capacity(survivors2.len());
184-
for &(idx, d12) in &survivors2 {
234+
for (si, &(idx, d12)) in survivors2.iter().enumerate() {
235+
if si + 1 < survivors2.len() {
236+
let next_idx = survivors2[si + 1].0;
237+
prefetch_t0(self.stroke3[next_idx * STROKE3_BYTES..].as_ptr());
238+
}
185239
let d3 = bitwise::hamming_distance_raw(query_s3, self.get_stroke3(idx));
186240
results.push(RankedHit {
187241
index: self.original_id(idx) as usize,

0 commit comments

Comments
 (0)