feat: U8x64 byte-level ops for palette codec, nibble, byte scan (Pumpkin/SD)

claude · claude · commit bad2a55160c9 · 2026-04-03T17:20:00.000Z
Added to all three tiers (AVX-512 / AVX2 / scalar):

- cmpeq_mask(other) → u64 — byte-wise equality, returns bitmask
- shr_epi16(imm) → Self — shift right 16-bit lanes (nibble extract)
- saturating_sub(other) — max(a-b, 0) per byte (delta subtraction)
- unpack_lo_epi8(other) — interleave low bytes (nibble interleave)
- unpack_hi_epi8(other) — interleave high bytes

These operations are used by:

- palette_codec.rs — Minecraft-style variable-width bit packing
- nibble.rs — 4-bit light level packing (Pumpkin)
- byte_scan.rs — NBT format byte scanning (future)
- stable_diffusion/ — VAE latent palette encoding via GGUF

All three are currently using raw _mm256_/_mm512_ intrinsics.
Next step: rewire them to use crate::simd::U8x64 instead.

https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
diff --git a/src/simd.rs b/src/simd.rs
@@ -713,22 +713,62 @@ mod scalar {
         fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; }
     }
 
-    // U8x64 extra methods
+    // U8x64 extra methods — byte-level operations for palette codec, nibble, byte scan
     impl U8x64 {
+        /// Smallest of the 64 bytes.
         #[inline(always)]
         pub fn reduce_min(self) -> u8 { *self.0.iter().min().unwrap_or(&0) }
+        /// Largest of the 64 bytes.
         #[inline(always)]
         pub fn reduce_max(self) -> u8 { *self.0.iter().max().unwrap_or(&0) }
+        /// Per-byte minimum of `self` and `other`.
         #[inline(always)]
         pub fn simd_min(self, other: Self) -> Self {
+            let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].min(other.0[i]); } Self(out)
+        }
+        /// Per-byte maximum of `self` and `other`.
+        #[inline(always)]
+        pub fn simd_max(self, other: Self) -> Self {
+            let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].max(other.0[i]); } Self(out)
+        }
+        /// Byte-wise equality mask: bit `i` of the result is set iff `self[i] == other[i]`.
+        #[inline(always)]
+        pub fn cmpeq_mask(self, other: Self) -> u64 {
+            let mut mask = 0u64;
+            for i in 0..64 { if self.0[i] == other.0[i] { mask |= 1u64 << i; } }
+            mask
+        }
+        /// Logical right shift of each 16-bit lane (little-endian byte pair) by `imm` bits.
+        /// `imm >= 16` clears every lane (as `_mm512_srli_epi16` does) instead of overflowing the shift.
+        #[inline(always)]
+        pub fn shr_epi16(self, imm: u32) -> Self {
             let mut out = [0u8; 64];
-            for i in 0..64 { out[i] = self.0[i].min(other.0[i]); }
+            for i in (0..64).step_by(2) {
+                let val = u16::from_le_bytes([self.0[i], self.0[i + 1]]);
+                // `val >> imm` would panic (debug) or wrap the count (release) once imm >= 16.
+                let shifted = val.checked_shr(imm).unwrap_or(0);
+                let bytes = shifted.to_le_bytes();
+                out[i] = bytes[0]; out[i + 1] = bytes[1];
+            }
             Self(out)
         }
+        /// Saturating unsigned subtraction: `max(a - b, 0)` per byte.
         #[inline(always)]
-        pub fn simd_max(self, other: Self) -> Self {
+        pub fn saturating_sub(self, other: Self) -> Self {
+            let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); } Self(out)
+        }
+        /// Interleave the low 8 bytes of each 16-byte lane: `[a0, b0, a1, b1, ...]`.
+        #[inline(always)]
+        pub fn unpack_lo_epi8(self, other: Self) -> Self {
+            let mut out = [0u8; 64];
+            for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+i]; out[b+i*2+1] = other.0[b+i]; } }
+            Self(out)
+        }
+        /// Interleave the high 8 bytes of each 16-byte lane: `[a8, b8, a9, b9, ...]`.
+        #[inline(always)]
+        pub fn unpack_hi_epi8(self, other: Self) -> Self {
             let mut out = [0u8; 64];
-            for i in 0..64 { out[i] = self.0[i].max(other.0[i]); }
+            for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+8+i]; out[b+i*2+1] = other.0[b+8+i]; } }
             Self(out)
         }
     }
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
@@ -761,6 +761,99 @@ macro_rules! avx2_int_type {
 }
 
 avx2_int_type!(U8x64, u8, 64, 0u8);
+
+// ── U8x64 byte-level operations (scalar fallback for AVX2 tier) ──────────
+// These match the AVX-512 U8x64 methods in simd_avx512.rs.
+impl U8x64 {
+    /// Byte-wise equality mask: bit `i` is set iff `self[i] == other[i]`.
+    #[inline(always)]
+    pub fn cmpeq_mask(self, other: Self) -> u64 {
+        let mut mask = 0u64;
+        for (i, (a, b)) in self.0.iter().zip(other.0.iter()).enumerate() {
+            mask |= u64::from(a == b) << i;
+        }
+        mask
+    }
+
+    /// Logical right shift of each 16-bit lane by `imm` bits.
+    ///
+    /// A lane is a little-endian pair of adjacent bytes. As with
+    /// `_mm512_srli_epi16`, a count of 16 or more clears every lane.
+    #[inline(always)]
+    pub fn shr_epi16(self, imm: u32) -> Self {
+        let mut out = [0u8; 64];
+        for (dst, src) in out.chunks_exact_mut(2).zip(self.0.chunks_exact(2)) {
+            let lane = u16::from_le_bytes([src[0], src[1]]);
+            // Plain `lane >> imm` panics in debug builds (and wraps the count in
+            // release builds) once `imm >= 16`; `checked_shr` turns that into zero.
+            dst.copy_from_slice(&lane.checked_shr(imm).unwrap_or(0).to_le_bytes());
+        }
+        Self(out)
+    }
+
+    /// Saturating unsigned subtraction: `max(a - b, 0)` per byte.
+    #[inline(always)]
+    pub fn saturating_sub(self, other: Self) -> Self {
+        Self(core::array::from_fn(|i| self.0[i].saturating_sub(other.0[i])))
+    }
+
+    /// Interleave the low 8 bytes of each 128-bit lane: `[a0, b0, a1, b1, ...]`.
+    #[inline(always)]
+    pub fn unpack_lo_epi8(self, other: Self) -> Self {
+        let mut out = [0u8; 64];
+        // Four independent 16-byte lanes in a 512-bit vector.
+        for base in (0..64).step_by(16) {
+            for i in 0..8 {
+                out[base + 2 * i] = self.0[base + i];
+                out[base + 2 * i + 1] = other.0[base + i];
+            }
+        }
+        Self(out)
+    }
+
+    /// Interleave the high 8 bytes of each 128-bit lane: `[a8, b8, a9, b9, ...]`.
+    #[inline(always)]
+    pub fn unpack_hi_epi8(self, other: Self) -> Self {
+        let mut out = [0u8; 64];
+        // Same lane layout as `unpack_lo_epi8`, reading bytes 8..16 of each lane.
+        for base in (0..64).step_by(16) {
+            for i in 0..8 {
+                out[base + 2 * i] = self.0[base + 8 + i];
+                out[base + 2 * i + 1] = other.0[base + 8 + i];
+            }
+        }
+        Self(out)
+    }
+
+    // Reductions and min/max are written out here because the
+    // `avx2_int_type!` macro does not generate them.
+    /// Smallest of the 64 bytes.
+    #[inline(always)]
+    pub fn reduce_min(self) -> u8 {
+        // A `[u8; 64]` is never empty, so the fallback is unreachable;
+        // `unwrap_or` just keeps a panic path out of the method.
+        self.0.iter().copied().min().unwrap_or(0)
+    }
+
+    /// Largest of the 64 bytes.
+    #[inline(always)]
+    pub fn reduce_max(self) -> u8 {
+        self.0.iter().copied().max().unwrap_or(0)
+    }
+
+    /// Per-byte minimum of `self` and `other`.
+    #[inline(always)]
+    pub fn simd_min(self, other: Self) -> Self {
+        Self(core::array::from_fn(|i| self.0[i].min(other.0[i])))
+    }
+
+    /// Per-byte maximum of `self` and `other`.
+    #[inline(always)]
+    pub fn simd_max(self, other: Self) -> Self {
+        Self(core::array::from_fn(|i| self.0[i].max(other.0[i])))
+    }
+}
+
 avx2_int_type!(I32x16, i32, 16, 0i32);
 avx2_int_type!(I64x8, i64, 8, 0i64);
 avx2_int_type!(U32x16, u32, 16, 0u32);
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs
@@ -576,6 +576,68 @@ impl U8x64 {
     pub fn simd_max(self, other: Self) -> Self {
         Self(unsafe { _mm512_max_epu8(self.0, other.0) })
     }
+
+    // ── Byte-level operations for palette codec, nibble, byte scan ──────
+    // Reference: Pumpkin/Minecraft-derived modules (palette_codec.rs,
+    // nibble.rs, byte_scan.rs) use these for 4-bit packing and scanning.
+    // NOTE(review): the `unsafe` intrinsic calls below presumably rely on this
+    // tier only being selected when AVX-512BW is available — confirm.
+
+    /// Byte-wise equality comparison. Returns a 64-bit mask: bit `i` is set iff `a[i] == b[i]`.
+    #[inline(always)]
+    pub fn cmpeq_mask(self, other: Self) -> u64 {
+        unsafe { _mm512_cmpeq_epi8_mask(self.0, other.0) }
+    }
+
+    /// Shift right each 16-bit lane by `imm` bits (for nibble extraction).
+    ///
+    /// Operates on 16-bit lanes, not 8-bit ones — matches `_mm512_srli_epi16`:
+    /// `imm == 0` returns `self` unchanged and `imm >= 16` yields all zeros.
+    #[inline(always)]
+    pub fn shr_epi16(self, imm: u32) -> Self {
+        // The intrinsic requires a const immediate, so dispatch on the runtime
+        // value. Every count in 0..=15 has its own arm; falling through to the
+        // zero arm for 0 or 9..=15 would disagree with the scalar and AVX2 tiers.
+        Self(unsafe { match imm {
+            0 => self.0,
+            1 => _mm512_srli_epi16(self.0, 1),
+            2 => _mm512_srli_epi16(self.0, 2),
+            3 => _mm512_srli_epi16(self.0, 3),
+            4 => _mm512_srli_epi16(self.0, 4),
+            5 => _mm512_srli_epi16(self.0, 5),
+            6 => _mm512_srli_epi16(self.0, 6),
+            7 => _mm512_srli_epi16(self.0, 7),
+            8 => _mm512_srli_epi16(self.0, 8),
+            9 => _mm512_srli_epi16(self.0, 9),
+            10 => _mm512_srli_epi16(self.0, 10),
+            11 => _mm512_srli_epi16(self.0, 11),
+            12 => _mm512_srli_epi16(self.0, 12),
+            13 => _mm512_srli_epi16(self.0, 13),
+            14 => _mm512_srli_epi16(self.0, 14),
+            15 => _mm512_srli_epi16(self.0, 15),
+            _ => _mm512_setzero_si512(),
+        }})
+    }
+
+    /// Saturating unsigned subtraction: `max(a - b, 0)` per byte.
+    #[inline(always)]
+    pub fn saturating_sub(self, other: Self) -> Self {
+        Self(unsafe { _mm512_subs_epu8(self.0, other.0) })
+    }
+
+    /// Interleave low bytes: `[a0, b0, a1, b1, ...]` from the low 8 bytes of
+    /// each 128-bit lane (not the low half of the whole vector).
+    #[inline(always)]
+    pub fn unpack_lo_epi8(self, other: Self) -> Self {
+        Self(unsafe { _mm512_unpacklo_epi8(self.0, other.0) })
+    }
+
+    /// Interleave high bytes: `[a8, b8, a9, b9, ...]` from the high 8 bytes of
+    /// each 128-bit lane (not the high half of the whole vector).
+    #[inline(always)]
+    pub fn unpack_hi_epi8(self, other: Self) -> Self {
+        Self(unsafe { _mm512_unpackhi_epi8(self.0, other.0) })
+    }
 }
 
 // u8 add/sub use AVX-512BW instructions