|
3 | 3 | //! SIMD-accelerated squared-distance, radius filtering, and K-nearest-neighbor |
4 | 4 | //! searches over contiguous point slices. All operations work on borrowed slices |
5 | 5 | //! with no internal copies. Scalar fallback is provided for non-x86 targets. |
| 6 | +//! |
| 7 | +//! # Slice-shape geometric distance (PR-X10 A6) |
| 8 | +//! |
| 9 | +//! For arbitrary-length f64 slices (non-3D-point shape), use: |
| 10 | +//! |
| 11 | +//! - [`l1_f64_simd`] — Manhattan: `Σ |a_i − b_i|` |
| 12 | +//! - [`l2_f64_simd`] — Euclidean: `√Σ (a_i − b_i)²` |
| 13 | +//! - [`linf_f64_simd`] — Chebyshev: `max |a_i − b_i|` |
| 14 | +//! |
| 15 | +//! These use the `F64x8` polyfill (no `target_feature`, no `unsafe`), |
| 16 | +//! matching the [`crate::hpc::heel_f64x8::cosine_f64_simd`] idiom: F64x8 |
| 17 | +//! chunks with FMA / SIMD-max accumulator + scalar remainder. They are |
| 18 | +//! the salvaged kernels from the rolled-back PR #160 cross-repo arc |
| 19 | +//! (lance-graph `heel_f64x8::{l1, l2, linf}_f64_simd`), re-landed here |
| 20 | +//! per the linalg-core design's A6 worker scope and the |
| 21 | +//! `crate::hpc::linalg/mod.rs` hard boundary ("No distance metrics — |
| 22 | +//! those live in `crate::hpc::distance`"). |
6 | 23 |
|
7 | 24 | // --------------------------------------------------------------------------- |
8 | 25 | // Scalar helpers |
@@ -165,6 +182,108 @@ pub fn knn_f64(query: [f64; 3], points: &[[f64; 3]], k: usize) -> (Vec<usize>, V |
165 | 182 | (indices, sq_dists) |
166 | 183 | } |
167 | 184 |
|
| 185 | +// --------------------------------------------------------------------------- |
| 186 | +// Slice-shape geometric distance — PR-X10 A6 |
| 187 | +// --------------------------------------------------------------------------- |
| 188 | +// |
| 189 | +// Polyfilled F64x8 chunked path with scalar remainder; no `target_feature`, |
| 190 | +// no `unsafe` — the polyfill in `crate::simd::F64x8` owns runtime feature |
| 191 | +// dispatch (AVX-512 native zmm / AVX2 2×ymm / scalar [f64; 8]). |
| 192 | +// |
| 193 | +// All three kernels read `min(a.len(), b.len())` elements. Empty inputs |
| 194 | +// return 0.0. |
| 195 | + |
| 196 | +use crate::simd::F64x8; |
| 197 | + |
| 198 | +/// L1 (Manhattan) distance between two f64 slices: `Σ |a_i − b_i|`. |
| 199 | +/// |
| 200 | +/// EXACT precision class — the per-lane `(a - b).abs()` introduces no |
| 201 | +/// rounding beyond the standard subtract, and the reduce-sum order is |
| 202 | +/// lane-tree within each F64x8 chunk + sequential across chunks (matches |
| 203 | +/// the [`crate::hpc::heel_f64x8::cosine_f64_simd`] order so callers can |
| 204 | +/// reason about determinism the same way). |
| 205 | +/// |
| 206 | +/// Reads `min(a.len(), b.len())` elements. Returns 0.0 for empty inputs. |
| 207 | +pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64 { |
| 208 | + let n = a.len().min(b.len()); |
| 209 | + let chunks = n / 8; |
| 210 | + let mut acc = F64x8::splat(0.0); |
| 211 | + for i in 0..chunks { |
| 212 | + let va = F64x8::from_slice(&a[i * 8..]); |
| 213 | + let vb = F64x8::from_slice(&b[i * 8..]); |
| 214 | + acc = acc + (va - vb).abs(); |
| 215 | + } |
| 216 | + let mut sum = acc.reduce_sum(); |
| 217 | + let offset = chunks * 8; |
| 218 | + for i in 0..(n - offset) { |
| 219 | + sum += (a[offset + i] - b[offset + i]).abs(); |
| 220 | + } |
| 221 | + sum |
| 222 | +} |
| 223 | + |
| 224 | +/// L2 (Euclidean) distance between two f64 slices: `√Σ (a_i − b_i)²`. |
| 225 | +/// |
| 226 | +/// VERIFY precision class — the final `sqrt` is one ULP; the sum is |
| 227 | +/// lane-tree within each F64x8 + sequential across chunks (same order |
| 228 | +/// pattern as L1). Determinism across runs holds for fixed slice |
| 229 | +/// length and fixed chunking. For full order-independence use a |
| 230 | +/// pairwise-reduce variant (see `blas_level1::nrm2`). |
| 231 | +/// |
| 232 | +/// Reads `min(a.len(), b.len())` elements. Returns 0.0 for empty inputs. |
| 233 | +pub fn l2_f64_simd(a: &[f64], b: &[f64]) -> f64 { |
| 234 | + let n = a.len().min(b.len()); |
| 235 | + let chunks = n / 8; |
| 236 | + let mut acc = F64x8::splat(0.0); |
| 237 | + for i in 0..chunks { |
| 238 | + let va = F64x8::from_slice(&a[i * 8..]); |
| 239 | + let vb = F64x8::from_slice(&b[i * 8..]); |
| 240 | + let d = va - vb; |
| 241 | + acc = d.mul_add(d, acc); // acc += d*d (single FMA per chunk) |
| 242 | + } |
| 243 | + let mut sum_sq = acc.reduce_sum(); |
| 244 | + let offset = chunks * 8; |
| 245 | + for i in 0..(n - offset) { |
| 246 | + let d = a[offset + i] - b[offset + i]; |
| 247 | + sum_sq += d * d; |
| 248 | + } |
| 249 | + sum_sq.sqrt() |
| 250 | +} |
| 251 | + |
| 252 | +/// L∞ (Chebyshev) distance between two f64 slices: `max |a_i − b_i|`. |
| 253 | +/// |
| 254 | +/// EXACT precision class — `(a - b).abs()` and `max` introduce no |
| 255 | +/// rounding; the result is determined by the inputs alone (order- |
| 256 | +/// independent across chunks since `max` is associative and commutative |
| 257 | +/// under IEEE-754 for non-NaN inputs). |
| 258 | +/// |
| 259 | +/// Reads `min(a.len(), b.len())` elements. Returns 0.0 for empty inputs. |
| 260 | +/// |
| 261 | +/// # NaN handling |
| 262 | +/// |
| 263 | +/// IEEE-754 `_mm512_max_pd` returns the second operand when either input |
| 264 | +/// is NaN; callers passing NaN-tainted slices may observe non-deterministic |
| 265 | +/// max across runs (an upstream constraint, not a kernel bug). Audit |
| 266 | +/// upstream for NaN before relying on this kernel on production data. |
| 267 | +pub fn linf_f64_simd(a: &[f64], b: &[f64]) -> f64 { |
| 268 | + let n = a.len().min(b.len()); |
| 269 | + let chunks = n / 8; |
| 270 | + let mut max_v = F64x8::splat(0.0); |
| 271 | + for i in 0..chunks { |
| 272 | + let va = F64x8::from_slice(&a[i * 8..]); |
| 273 | + let vb = F64x8::from_slice(&b[i * 8..]); |
| 274 | + max_v = max_v.simd_max((va - vb).abs()); |
| 275 | + } |
| 276 | + let mut max_d = max_v.reduce_max(); |
| 277 | + let offset = chunks * 8; |
| 278 | + for i in 0..(n - offset) { |
| 279 | + let d = (a[offset + i] - b[offset + i]).abs(); |
| 280 | + if d > max_d { |
| 281 | + max_d = d; |
| 282 | + } |
| 283 | + } |
| 284 | + max_d |
| 285 | +} |
| 286 | + |
168 | 287 | // --------------------------------------------------------------------------- |
169 | 288 | // Tests |
170 | 289 | // --------------------------------------------------------------------------- |
@@ -315,4 +434,178 @@ mod tests { |
315 | 434 | let result = squared_distances_f32(query, &points); |
316 | 435 | assert!(approx_eq_f32(result[0], 0.0)); |
317 | 436 | } |
| 437 | + |
| 438 | + // -- PR-X10 A6 slice-shape L1 / L2 / L∞ -- |
| 439 | + |
| 440 | + fn approx_eq_f64_tol(a: f64, b: f64, tol: f64) -> bool { |
| 441 | + (a - b).abs() < tol |
| 442 | + } |
| 443 | + |
| 444 | + /// Deterministic SplitMix64 — matches the pillar harness so the |
| 445 | + /// corpus is reproducible across runs and across machines. |
| 446 | + fn splitmix(state: &mut u64) -> u64 { |
| 447 | + *state = state.wrapping_add(0x9E37_79B9_7F4A_7C15); |
| 448 | + let mut z = *state; |
| 449 | + z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); |
| 450 | + z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); |
| 451 | + z ^ (z >> 31) |
| 452 | + } |
| 453 | + |
| 454 | + fn random_vec_f64(seed: u64, n: usize) -> Vec<f64> { |
| 455 | + let mut s = seed; |
| 456 | + (0..n) |
| 457 | + .map(|_| { |
| 458 | + let bits = splitmix(&mut s) >> 11; |
| 459 | + (bits as f64) / (1u64 << 53) as f64 * 2.0 - 1.0 // uniform in [-1, 1) |
| 460 | + }) |
| 461 | + .collect() |
| 462 | + } |
| 463 | + |
| 464 | + // -- L1 boundary + parity -- |
| 465 | + |
| 466 | + #[test] |
| 467 | + fn l1_f64_simd_self_zero() { |
| 468 | + let a = random_vec_f64(0xC1A0, 200); |
| 469 | + assert_eq!(l1_f64_simd(&a, &a), 0.0); |
| 470 | + } |
| 471 | + |
| 472 | + #[test] |
| 473 | + fn l1_f64_simd_empty_is_zero() { |
| 474 | + let a: Vec<f64> = vec![]; |
| 475 | + let b: Vec<f64> = vec![]; |
| 476 | + assert_eq!(l1_f64_simd(&a, &b), 0.0); |
| 477 | + } |
| 478 | + |
| 479 | + #[test] |
| 480 | + fn l1_f64_simd_uniform_diff() { |
| 481 | + let a = vec![3.0f64; 17]; |
| 482 | + let b = vec![1.0f64; 17]; |
| 483 | + // 17 * |3 - 1| = 34 |
| 484 | + assert!(approx_eq_f64_tol(l1_f64_simd(&a, &b), 34.0, 1e-12)); |
| 485 | + } |
| 486 | + |
| 487 | + #[test] |
| 488 | + fn l1_f64_simd_matches_scalar() { |
| 489 | + // 200 elements covers chunked path (25 chunks of 8) + remainder of 0; |
| 490 | + // 199 covers chunked + remainder of 7. |
| 491 | + for &n in &[1usize, 7, 8, 15, 16, 17, 64, 199, 200, 1024] { |
| 492 | + let a = random_vec_f64(0xA110_C1A0, n); |
| 493 | + let b = random_vec_f64(0xB220_C1A0, n); |
| 494 | + let simd = l1_f64_simd(&a, &b); |
| 495 | + let scalar: f64 = a.iter().zip(&b).map(|(x, y)| (x - y).abs()).sum(); |
| 496 | + assert!( |
| 497 | + approx_eq_f64_tol(simd, scalar, 1e-11), |
| 498 | + "n={} simd={:.15} scalar={:.15}", |
| 499 | + n, |
| 500 | + simd, |
| 501 | + scalar |
| 502 | + ); |
| 503 | + } |
| 504 | + } |
| 505 | + |
| 506 | + // -- L2 boundary + parity -- |
| 507 | + |
| 508 | + #[test] |
| 509 | + fn l2_f64_simd_self_zero() { |
| 510 | + let a = random_vec_f64(0xC2A0, 200); |
| 511 | + assert_eq!(l2_f64_simd(&a, &a), 0.0); |
| 512 | + } |
| 513 | + |
| 514 | + #[test] |
| 515 | + fn l2_f64_simd_empty_is_zero() { |
| 516 | + let a: Vec<f64> = vec![]; |
| 517 | + let b: Vec<f64> = vec![]; |
| 518 | + assert_eq!(l2_f64_simd(&a, &b), 0.0); |
| 519 | + } |
| 520 | + |
| 521 | + #[test] |
| 522 | + fn l2_f64_simd_pythagoras() { |
| 523 | + // (3, 0, …) vs (0, 4, …): √(9 + 16) = 5 |
| 524 | + let a = vec![3.0f64, 0.0]; |
| 525 | + let b = vec![0.0f64, 4.0]; |
| 526 | + assert!(approx_eq_f64_tol(l2_f64_simd(&a, &b), 5.0, 1e-12)); |
| 527 | + } |
| 528 | + |
| 529 | + #[test] |
| 530 | + fn l2_f64_simd_matches_scalar() { |
| 531 | + for &n in &[1usize, 7, 8, 15, 16, 17, 64, 199, 200, 1024] { |
| 532 | + let a = random_vec_f64(0xA110_C2A0, n); |
| 533 | + let b = random_vec_f64(0xB220_C2A0, n); |
| 534 | + let simd = l2_f64_simd(&a, &b); |
| 535 | + let sum_sq: f64 = a.iter().zip(&b).map(|(x, y)| (x - y).powi(2)).sum(); |
| 536 | + let scalar = sum_sq.sqrt(); |
| 537 | + // Sqrt is 1 ULP; cross-chunk summation order differs by chunks |
| 538 | + // of 8 vs sequential — allow generous relative tolerance. |
| 539 | + let rel = (simd - scalar).abs() / scalar.max(1e-12); |
| 540 | + assert!( |
| 541 | + rel < 1e-10, |
| 542 | + "n={} simd={:.15} scalar={:.15} rel={:.2e}", |
| 543 | + n, |
| 544 | + simd, |
| 545 | + scalar, |
| 546 | + rel |
| 547 | + ); |
| 548 | + } |
| 549 | + } |
| 550 | + |
| 551 | + // -- L∞ boundary + parity -- |
| 552 | + |
| 553 | + #[test] |
| 554 | + fn linf_f64_simd_self_zero() { |
| 555 | + let a = random_vec_f64(0xC1FF, 200); |
| 556 | + assert_eq!(linf_f64_simd(&a, &a), 0.0); |
| 557 | + } |
| 558 | + |
| 559 | + #[test] |
| 560 | + fn linf_f64_simd_empty_is_zero() { |
| 561 | + let a: Vec<f64> = vec![]; |
| 562 | + let b: Vec<f64> = vec![]; |
| 563 | + assert_eq!(linf_f64_simd(&a, &b), 0.0); |
| 564 | + } |
| 565 | + |
| 566 | + #[test] |
| 567 | + fn linf_f64_simd_picks_max_in_chunk() { |
| 568 | + // Max difference must land inside a chunked path (index 5 < 8) and |
| 569 | + // also outside (index 13 > 8) to exercise both halves. |
| 570 | + let mut a = vec![0.0f64; 16]; |
| 571 | + let mut b = vec![0.0f64; 16]; |
| 572 | + a[5] = 0.5; |
| 573 | + a[13] = -0.7; // |Δ| = 0.7 — should win |
| 574 | + b[2] = 0.1; |
| 575 | + assert!(approx_eq_f64_tol(linf_f64_simd(&a, &b), 0.7, 1e-12)); |
| 576 | + } |
| 577 | + |
| 578 | + #[test] |
| 579 | + fn linf_f64_simd_matches_scalar() { |
| 580 | + for &n in &[1usize, 7, 8, 15, 16, 17, 64, 199, 200, 1024] { |
| 581 | + let a = random_vec_f64(0xA110_C1FF, n); |
| 582 | + let b = random_vec_f64(0xB220_C1FF, n); |
| 583 | + let simd = linf_f64_simd(&a, &b); |
| 584 | + let scalar: f64 = a |
| 585 | + .iter() |
| 586 | + .zip(&b) |
| 587 | + .map(|(x, y)| (x - y).abs()) |
| 588 | + .fold(0.0_f64, f64::max); |
| 589 | + assert!( |
| 590 | + approx_eq_f64_tol(simd, scalar, 1e-15), |
| 591 | + "n={} simd={:.15} scalar={:.15}", |
| 592 | + n, |
| 593 | + simd, |
| 594 | + scalar |
| 595 | + ); |
| 596 | + } |
| 597 | + } |
| 598 | + |
| 599 | + /// Mismatched-length slices: must use the shorter length, no panic. |
| 600 | + #[test] |
| 601 | + fn slice_distances_mismatched_length_uses_min() { |
| 602 | + let a = vec![1.0f64; 17]; |
| 603 | + let b = vec![2.0f64; 10]; |
| 604 | + // L1 over min=10: 10 * |1 - 2| = 10 |
| 605 | + assert!(approx_eq_f64_tol(l1_f64_simd(&a, &b), 10.0, 1e-12)); |
| 606 | + // L2 over min=10: √(10 * 1) = √10 |
| 607 | + assert!(approx_eq_f64_tol(l2_f64_simd(&a, &b), 10f64.sqrt(), 1e-12)); |
| 608 | + // L∞ = 1 |
| 609 | + assert!(approx_eq_f64_tol(linf_f64_simd(&a, &b), 1.0, 1e-12)); |
| 610 | + } |
318 | 611 | } |
0 commit comments