AdaWorldAPI
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
Lines changed: 2 additions & 0 deletions
diff --git a/src/core/scent.rs b/src/core/scent.rs
Lines changed: 2 additions & 23 deletions
diff --git a/src/core/simd.rs b/src/core/simd.rs
Lines changed: 8 additions & 144 deletions
@@ -211,6 +211,7 @@ crewai-vendor = { package = "crewai", path = "../crewai-rust" }
 # -----------------------------------------------------------------------------
 rustynum-rs = { path = "../rustynum/rustynum-rs" }
 rustynum-core = { path = "../rustynum/rustynum-core", features = ["avx512"] }
+rustynum-bnn = { path = "../rustynum/rustynum-bnn", features = ["avx512"] }
 rustynum-arrow = { path = "../rustynum/rustynum-arrow", default-features = false, features = ["arrow"] }
 rustynum-holo = { path = "../rustynum/rustynum-holo", features = ["avx512"] }
 rustynum-clam = { path = "../rustynum/rustynum-clam", features = ["avx512"] }
 
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "stable"
@@ -512,29 +512,8 @@ fn timestamp() -> u64 {
         .unwrap_or(0)
 }
 
-// ========== SIMD Optimized Scent Scan ==========
-
-#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-mod simd {
-    use super::{BUCKETS, SCENT_BYTES, scent_distance};
-
-    /// SIMD-optimized scent scan (AVX2)
-    /// Compares query against 256 scents, returns matching chunk IDs
-    pub fn find_chunks_simd(
-        scents: &[[u8; SCENT_BYTES]; BUCKETS],
-        query: &[u8; SCENT_BYTES],
-        threshold: u32,
-    ) -> Vec<u8> {
-        // For now, fall back to scalar
-        // TODO: Implement AVX2 version
-        scents
-            .iter()
-            .enumerate()
-            .filter(|(_, s)| scent_distance(s, query) <= threshold)
-            .map(|(i, _)| i as u8)
-            .collect()
-    }
-}
+// NOTE: SIMD-optimized scent scan is handled via rustynum runtime dispatch.
+// No compile-time SIMD gates needed — rustynum detects AVX-512/AVX2 at runtime.
 
 #[cfg(test)]
 mod tests {
 
@@ -1,25 +1,24 @@
 //! SIMD-accelerated Hamming distance computation.
 //!
-//! Automatically selects the best implementation:
-//! - AVX-512 VPOPCNTDQ (Intel Ice Lake+, AMD Zen 4+)
-//! - AVX2 + manual popcount
-//! - NEON + CNT (ARM)
-//! - Scalar fallback
+//! All SIMD dispatch is handled by rustynum-core at runtime.
+//! This module is a thin wrapper — ladybug-rs NEVER reimplements SIMD.
+//!
+//! Dispatch path:
+//!   AVX-512 VPOPCNTDQ → AVX2 Harley-Seal → scalar POPCNT
+//!   (one binary, all x86_64 CPUs, runtime CPUID detection)
 
 use crate::FINGERPRINT_U64;
 use crate::core::Fingerprint;
 
 /// Compute Hamming distance between two fingerprints.
 ///
-/// Uses runtime-dispatched SIMD via rustynum (works on any x86_64 CPU
-/// without compile-time `-C target-feature`). Detects AVX-512 VPOPCNTDQ
-/// at runtime via `is_x86_feature_detected!()`.
+/// Forwards to `rustynum_accel::fingerprint_hamming`; dispatch there is AVX-512 → AVX2 → scalar.
 #[inline]
 pub fn hamming_distance(a: &Fingerprint, b: &Fingerprint) -> u32 {
     crate::core::rustynum_accel::fingerprint_hamming(a, b)
 }
 
-/// Scalar implementation (works everywhere)
+/// Scalar reference implementation (meant for tests; still `pub` and not `#[cfg(test)]`-gated).
 #[inline]
 pub fn hamming_scalar(a: &Fingerprint, b: &Fingerprint) -> u32 {
     let a_data = a.as_raw();
@@ -32,141 +31,6 @@ pub fn hamming_scalar(a: &Fingerprint, b: &Fingerprint) -> u32 {
     total
 }
 
-/// AVX-512 with VPOPCNTDQ instruction (fastest)
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512vpopcntdq"))]
-#[target_feature(enable = "avx512f", enable = "avx512vpopcntdq")]
-unsafe fn hamming_avx512(a: &Fingerprint, b: &Fingerprint) -> u32 {
-    unsafe {
-        use std::arch::x86_64::*;
-
-        let a_ptr = a.as_raw().as_ptr();
-        let b_ptr = b.as_raw().as_ptr();
-
-        let mut sum = _mm512_setzero_si512();
-
-        // Process 8 u64 at a time (512 bits)
-        let mut i = 0;
-        while i + 8 <= FINGERPRINT_U64 {
-            let va = _mm512_loadu_si512(a_ptr.add(i) as *const __m512i);
-            let vb = _mm512_loadu_si512(b_ptr.add(i) as *const __m512i);
-            let xor = _mm512_xor_si512(va, vb);
-            let popcnt = _mm512_popcnt_epi64(xor);
-            sum = _mm512_add_epi64(sum, popcnt);
-            i += 8;
-        }
-
-        // Horizontal sum
-        let mut total = _mm512_reduce_add_epi64(sum) as u32;
-
-        // Handle remaining (256 % 8 = 0, no remainder at 16K)
-        while i < FINGERPRINT_U64 {
-            total += (*a_ptr.add(i) ^ *b_ptr.add(i)).count_ones();
-            i += 1;
-        }
-
-        total
-    }
-}
-
-/// AVX2 implementation (fallback for older x86_64)
-#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-#[target_feature(enable = "avx2")]
-unsafe fn hamming_avx2(a: &Fingerprint, b: &Fingerprint) -> u32 {
-    unsafe {
-        use std::arch::x86_64::*;
-
-        let a_ptr = a.as_raw().as_ptr();
-        let b_ptr = b.as_raw().as_ptr();
-
-        // Lookup table for 4-bit popcount
-        let lookup = _mm256_setr_epi8(
-            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2,
-            3, 3, 4,
-        );
-        let low_mask = _mm256_set1_epi8(0x0f);
-
-        let mut total_sum = _mm256_setzero_si256();
-
-        // Process 4 u64 at a time (256 bits)
-        let mut i = 0;
-        while i + 4 <= FINGERPRINT_U64 {
-            let va = _mm256_loadu_si256(a_ptr.add(i) as *const __m256i);
-            let vb = _mm256_loadu_si256(b_ptr.add(i) as *const __m256i);
-            let xor = _mm256_xor_si256(va, vb);
-
-            // Popcount via lookup table
-            let lo = _mm256_and_si256(xor, low_mask);
-            let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask);
-            let popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
-            let popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
-            let popcnt = _mm256_add_epi8(popcnt_lo, popcnt_hi);
-
-            // Sum bytes
-            let sad = _mm256_sad_epu8(popcnt, _mm256_setzero_si256());
-            total_sum = _mm256_add_epi64(total_sum, sad);
-
-            i += 4;
-        }
-
-        // Horizontal sum
-        let sum_lo = _mm256_extracti128_si256(total_sum, 0);
-        let sum_hi = _mm256_extracti128_si256(total_sum, 1);
-        let sum128 = _mm_add_epi64(sum_lo, sum_hi);
-        let mut total = (_mm_extract_epi64(sum128, 0) + _mm_extract_epi64(sum128, 1)) as u32;
-
-        // Handle remaining
-        while i < FINGERPRINT_U64 {
-            total += (*a_ptr.add(i) ^ *b_ptr.add(i)).count_ones();
-            i += 1;
-        }
-
-        total
-    }
-}
-
-/// ARM NEON implementation
-#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-#[target_feature(enable = "neon")]
-unsafe fn hamming_neon(a: &Fingerprint, b: &Fingerprint) -> u32 {
-    unsafe {
-        use std::arch::aarch64::*;
-
-        let a_ptr = a.as_raw().as_ptr() as *const u8;
-        let b_ptr = b.as_raw().as_ptr() as *const u8;
-
-        let mut sum = vdupq_n_u64(0);
-
-        // Process 16 bytes at a time
-        let mut i = 0;
-        let byte_len = FINGERPRINT_U64 * 8;
-        while i + 16 <= byte_len {
-            let va = vld1q_u8(a_ptr.add(i));
-            let vb = vld1q_u8(b_ptr.add(i));
-            let xor = veorq_u8(va, vb);
-            let cnt = vcntq_u8(xor); // Count bits per byte
-
-            // Sum to 64-bit
-            let sum16 = vpaddlq_u8(cnt); // u8 -> u16
-            let sum32 = vpaddlq_u16(sum16); // u16 -> u32
-            let sum64 = vpaddlq_u32(sum32); // u32 -> u64
-            sum = vaddq_u64(sum, sum64);
-
-            i += 16;
-        }
-
-        // Horizontal sum
-        let mut total = (vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1)) as u32;
-
-        // Handle remaining bytes
-        while i < byte_len {
-            total += (*a_ptr.add(i) ^ *b_ptr.add(i)).count_ones();
-            i += 1;
-        }
-
-        total
-    }
-}
-
 /// Batch Hamming distance computation (parallel)
 #[cfg(feature = "parallel")]
 pub fn batch_hamming(query: &Fingerprint, corpus: &[Fingerprint]) -> Vec<u32> {