Merge pull request #76 from AdaWorldAPI/claude/setup-embedding-pipeline-Fa65C

AdaWorldAPI · web-flow · commit 8ba065ca1a7b · 2026-04-03T19:42:02.000+02:00
feat: U8x64 byte-level ops for palette codec, nibble, byte scan (Pumpkin/SD) Added to all three tiers (AVX-512 / AVX2 / scalar): cmpeq_mask(other) → u64 — byte-wise equality, returns bitmask shr_epi16(imm) → Self — shift right 16-bit lanes (nibble extract) saturating_sub(other) — max(a-b, 0) per byte (delta subtraction) unpack_lo_epi8(other) — interleave low bytes (nibble interleave) unpack_hi_epi8(other) — interleave high bytes These operations are used by: palette_codec.rs — Minecraft-style variable-width bit packing nibble.rs — 4-bit light level packing (Pumpkin) byte_scan.rs — NBT format byte scanning (future) stable_diffusion/ — VAE latent palette encoding via GGUF All three are currently using raw _mm256_/_mm512_ intrinsics. Next step: rewire them to use crate::simd::U8x64 instead. https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
diff --git a/src/simd.rs b/src/simd.rs
@@ -22,6 +22,64 @@ static TIER: LazyLock<Tier> = LazyLock::new(|| {
 #[inline(always)]
 fn tier() -> Tier { *TIER }
 
+// BF16 tier detection happens inline in bf16_to_f32_batch() via
+// is_x86_feature_detected!("avx512bf16") — no LazyLock needed.
+// The check is cheap (reads a cached cpuid result) and the batch
+// function uses as_chunks::<16>() + as_chunks::<8>() for SIMD widths.
+
+// ============================================================================
+// Preferred SIMD lane widths — compile-time constants for array_windows
+// ============================================================================
+//
+// Consumer code uses these to select array_windows size at compile time:
+//
+//   for window in data.array_windows::<{crate::simd::PREFERRED_F64_LANES}>() {
+//       let v = F64x8::from_array(*window);   // AVX-512: native 8-wide
+//       // or
+//       let v = F64x4::from_array(*window);   // AVX2: native 4-wide
+//   }
+//
+// generic_const_exprs is nightly, so consumers must #[cfg] branch on window size.
+// These constants document the preferred width per tier.
+
+/// Preferred f64 SIMD width (elements per register).
+/// AVX-512: 8 lanes (__m512d). AVX2/scalar: 4 lanes (__m256d).
+#[cfg(target_feature = "avx512f")]
+pub const PREFERRED_F64_LANES: usize = 8;
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub const PREFERRED_F64_LANES: usize = 4;
+#[cfg(not(target_arch = "x86_64"))]
+pub const PREFERRED_F64_LANES: usize = 4; // scalar fallback: same as AVX2 shape
+
+/// Preferred f32 SIMD width.
+/// AVX-512: 16 lanes (__m512). AVX2/scalar: 8 lanes (__m256).
+#[cfg(target_feature = "avx512f")]
+pub const PREFERRED_F32_LANES: usize = 16;
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub const PREFERRED_F32_LANES: usize = 8;
+#[cfg(not(target_arch = "x86_64"))]
+pub const PREFERRED_F32_LANES: usize = 8;
+
+/// Preferred u64 SIMD width.
+/// AVX-512: 8 lanes. AVX2/scalar: 4 lanes.
+#[cfg(target_feature = "avx512f")]
+pub const PREFERRED_U64_LANES: usize = 8;
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub const PREFERRED_U64_LANES: usize = 4;
+#[cfg(not(target_arch = "x86_64"))]
+pub const PREFERRED_U64_LANES: usize = 4;
+
+/// Preferred i16 SIMD width (for Base17 L1 on i16[17]).
+/// AVX-512: 32 lanes (__m512i via epi16). AVX2: 16 lanes (__m256i).
+/// Base17 has 17 dims — AVX-512 covers 32 (load 17 + 15 padding),
+/// AVX2 covers 16 + 1 scalar.
+#[cfg(target_feature = "avx512f")]
+pub const PREFERRED_I16_LANES: usize = 32;
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+pub const PREFERRED_I16_LANES: usize = 16;
+#[cfg(not(target_arch = "x86_64"))]
+pub const PREFERRED_I16_LANES: usize = 16;
+
 // ============================================================================
 // x86_64: re-export based on tier
 // ============================================================================
@@ -41,6 +99,16 @@ pub use crate::simd_avx512::{
     f32x16, f64x8, u8x64, i32x16, i64x8, u32x16, u64x8,
 };
 
+// BF16 types + batch conversion (always available — scalar fallback built in)
+#[cfg(target_arch = "x86_64")]
+pub use crate::simd_avx512::{
+    bf16_to_f32_scalar, f32_to_bf16_scalar,
+    bf16_to_f32_batch, f32_to_bf16_batch,
+};
+// BF16 SIMD types only available when avx512bf16 is enabled at compile time
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))]
+pub use crate::simd_avx512::{BF16x16, BF16x8};
+
 #[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
 pub use crate::simd_avx512::{F32x8, F64x4, f32x8, f64x4};
 
@@ -645,22 +713,51 @@ mod scalar {
         fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; }
     }
 
-    // U8x64 extra methods
+    // U8x64 extra methods — byte-level operations for palette codec, nibble, byte scan
     impl U8x64 {
         #[inline(always)]
         pub fn reduce_min(self) -> u8 { *self.0.iter().min().unwrap_or(&0) }
         #[inline(always)]
         pub fn reduce_max(self) -> u8 { *self.0.iter().max().unwrap_or(&0) }
         #[inline(always)]
         pub fn simd_min(self, other: Self) -> Self {
+            let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].min(other.0[i]); } Self(out)
+        }
+        #[inline(always)]
+        pub fn simd_max(self, other: Self) -> Self {
+            let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].max(other.0[i]); } Self(out)
+        }
+        #[inline(always)]
+        pub fn cmpeq_mask(self, other: Self) -> u64 {
+            let mut mask = 0u64;
+            for i in 0..64 { if self.0[i] == other.0[i] { mask |= 1u64 << i; } }
+            mask
+        }
+        #[inline(always)]
+        pub fn shr_epi16(self, imm: u32) -> Self {
             let mut out = [0u8; 64];
-            for i in 0..64 { out[i] = self.0[i].min(other.0[i]); }
+            for i in (0..64).step_by(2) {
+                let val = u16::from_le_bytes([self.0[i], self.0[i + 1]]);
+                let shifted = val >> imm;
+                let bytes = shifted.to_le_bytes();
+                out[i] = bytes[0]; out[i + 1] = bytes[1];
+            }
             Self(out)
         }
         #[inline(always)]
-        pub fn simd_max(self, other: Self) -> Self {
+        pub fn saturating_sub(self, other: Self) -> Self {
+            let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); } Self(out)
+        }
+        #[inline(always)]
+        pub fn unpack_lo_epi8(self, other: Self) -> Self {
             let mut out = [0u8; 64];
-            for i in 0..64 { out[i] = self.0[i].max(other.0[i]); }
+            for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+i]; out[b+i*2+1] = other.0[b+i]; } }
+            Self(out)
+        }
+        #[inline(always)]
+        pub fn unpack_hi_epi8(self, other: Self) -> Self {
+            let mut out = [0u8; 64];
+            for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+8+i]; out[b+i*2+1] = other.0[b+8+i]; } }
             Self(out)
         }
     }
@@ -697,6 +794,20 @@ pub use scalar::{
     f32x8, f64x4,
 };
 
+// Scalar BF16 conversion — always available on all platforms
+#[cfg(not(target_arch = "x86_64"))]
+pub fn bf16_to_f32_scalar(bits: u16) -> f32 { f32::from_bits((bits as u32) << 16) }
+#[cfg(not(target_arch = "x86_64"))]
+pub fn f32_to_bf16_scalar(v: f32) -> u16 { (v.to_bits() >> 16) as u16 }
+#[cfg(not(target_arch = "x86_64"))]
+pub fn bf16_to_f32_batch(input: &[u16], output: &mut [f32]) {
+    for (i, &b) in input.iter().enumerate() { if i < output.len() { output[i] = bf16_to_f32_scalar(b); } }
+}
+#[cfg(not(target_arch = "x86_64"))]
+pub fn f32_to_bf16_batch(input: &[f32], output: &mut [u16]) {
+    for (i, &v) in input.iter().enumerate() { if i < output.len() { output[i] = f32_to_bf16_scalar(v); } }
+}
+
 // ============================================================================
 // SIMD math functions — ndarray additions (not in std::simd)
 // ============================================================================
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
@@ -761,6 +761,76 @@ macro_rules! avx2_int_type {
 }
 
 avx2_int_type!(U8x64, u8, 64, 0u8);
+
+// ── U8x64 byte-level operations (scalar fallback for AVX2 tier) ──────────
+// These match the AVX-512 U8x64 methods in simd_avx512.rs.
+impl U8x64 {
+    /// Byte-wise equality mask: bit i set if self[i] == other[i].
+    #[inline(always)]
+    pub fn cmpeq_mask(self, other: Self) -> u64 {
+        let mut mask = 0u64;
+        for i in 0..64 { if self.0[i] == other.0[i] { mask |= 1u64 << i; } }
+        mask
+    }
+
+    /// Shift right each 16-bit lane by imm bits (operates on pairs of u8 as u16).
+    #[inline(always)]
+    pub fn shr_epi16(self, imm: u32) -> Self {
+        let mut out = [0u8; 64];
+        for i in (0..64).step_by(2) {
+            let val = u16::from_le_bytes([self.0[i], self.0[i + 1]]);
+            let shifted = val >> imm;
+            let bytes = shifted.to_le_bytes();
+            out[i] = bytes[0];
+            out[i + 1] = bytes[1];
+        }
+        Self(out)
+    }
+
+    /// Saturating unsigned subtraction: max(a - b, 0) per byte.
+    #[inline(always)]
+    pub fn saturating_sub(self, other: Self) -> Self {
+        let mut out = [0u8; 64];
+        for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); }
+        Self(out)
+    }
+
+    /// Interleave low bytes within each 128-bit lane.
+    #[inline(always)]
+    pub fn unpack_lo_epi8(self, other: Self) -> Self {
+        let mut out = [0u8; 64];
+        // Operates per 16-byte lane (4 lanes in 512-bit)
+        for lane in 0..4 {
+            let base = lane * 16;
+            for i in 0..8 {
+                out[base + i * 2] = self.0[base + i];
+                out[base + i * 2 + 1] = other.0[base + i];
+            }
+        }
+        Self(out)
+    }
+
+    /// Interleave high bytes within each 128-bit lane.
+    #[inline(always)]
+    pub fn unpack_hi_epi8(self, other: Self) -> Self {
+        let mut out = [0u8; 64];
+        for lane in 0..4 {
+            let base = lane * 16;
+            for i in 0..8 {
+                out[base + i * 2] = self.0[base + 8 + i];
+                out[base + i * 2 + 1] = other.0[base + 8 + i];
+            }
+        }
+        Self(out)
+    }
+
+    /// Reduce min/max (not in macro).
+    #[inline(always)] pub fn reduce_min(self) -> u8 { *self.0.iter().min().unwrap() }
+    #[inline(always)] pub fn reduce_max(self) -> u8 { *self.0.iter().max().unwrap() }
+    #[inline(always)] pub fn simd_min(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].min(other.0[i]); } Self(o) }
+    #[inline(always)] pub fn simd_max(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].max(other.0[i]); } Self(o) }
+}
+
 avx2_int_type!(I32x16, i32, 16, 0i32);
 avx2_int_type!(I64x8, i64, 8, 0i64);
 avx2_int_type!(U32x16, u32, 16, 0u32);
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs