feat: x86 simd implementation for i8x16_swizzle + i8x16_shuffle

explodingcamera · explodingcamera · commit 479b540436d8 · 2026-04-05T22:41:18.000+02:00
Signed-off-by: Henry <mail@henrygressmann.de>
diff --git a/crates/tinywasm/Cargo.toml b/crates/tinywasm/Cargo.toml
@@ -51,7 +51,7 @@ canonicalize_nans=[]
 # derive Debug for runtime/types structs
 debug=["tinywasm-types/debug"]
 
-# enable x86-specific SIMD intrinsics in Value128
+# enable x86-specific SIMD intrinsics in Value128 (uses unsafe code)
 # note: for x86 backend selection, compile with x86-64-v3 target features
 # (for example: `RUSTFLAGS="-C target-cpu=x86-64-v3"`)
 simd-x86=[]
diff --git a/crates/tinywasm/src/interpreter/simd/instructions.rs b/crates/tinywasm/src/interpreter/simd/instructions.rs
@@ -8,6 +8,20 @@ use super::super::no_std_floats::NoStdFloatExt;
 use core::arch::wasm32 as wasm;
 #[cfg(target_arch = "wasm64")]
 use core::arch::wasm64 as wasm;
+#[cfg(all(
+    feature = "simd-x86",
+    target_arch = "x86_64",
+    target_feature = "sse4.2",
+    target_feature = "avx",
+    target_feature = "avx2",
+    target_feature = "bmi1",
+    target_feature = "bmi2",
+    target_feature = "fma",
+    target_feature = "lzcnt",
+    target_feature = "movbe",
+    target_feature = "popcnt"
+))]
+use core::arch::x86_64 as x86;
 
 impl Value128 {
     #[doc(alias = "v128.any_true")]
@@ -132,20 +146,41 @@ impl Value128 {
 
     #[doc(alias = "i8x16.swizzle")]
     pub fn i8x16_swizzle(self, s: Self) -> Self {
-        #[cfg(any(target_arch = "wasm32", target_arch = "wasm64"))]
-        return Self::from_wasm_v128(wasm::i8x16_swizzle(self.to_wasm_v128(), s.to_wasm_v128()));
-
-        let a = self.to_le_bytes();
-        let idx = s.to_le_bytes();
-        let mut out = [0u8; 16];
-        let mut i = 0;
-        while i < 16 {
-            let j = idx[i];
-            let lane = a[(j & 0x0f) as usize];
-            out[i] = if j < 16 { lane } else { 0 };
-            i += 1;
+        simd_impl! {
+            wasm => { Self::from_wasm_v128(wasm::i8x16_swizzle(self.to_wasm_v128(), s.to_wasm_v128())) }
+            x86 => {
+                let a = self.to_le_bytes();
+                let idx = s.to_le_bytes();
+                let mut mask = [0u8; 16];
+                for i in 0..16 {
+                    let j = idx[i];
+                    mask[i] = if j < 16 { j & 0x0f } else { 0x80 };
+                }
+
+                // SAFETY: `a`, `mask`, and `out` are valid 16-byte buffers, and `_mm_loadu/_mm_storeu` support unaligned accesses.
+                #[allow(unsafe_code)]
+                let out = unsafe {
+                    let a_vec = x86::_mm_loadu_si128(a.as_ptr().cast::<x86::__m128i>());
+                    let mask_vec = x86::_mm_loadu_si128(mask.as_ptr().cast::<x86::__m128i>());
+                    let result = x86::_mm_shuffle_epi8(a_vec, mask_vec);
+                    let mut out = [0u8; 16];
+                    x86::_mm_storeu_si128(out.as_mut_ptr().cast::<x86::__m128i>(), result);
+                    out
+                };
+                Self::from_le_bytes(out)
+            }
+            generic => {
+                let a = self.to_le_bytes();
+                let idx = s.to_le_bytes();
+                let mut out = [0u8; 16];
+                for i in 0..16 {
+                    let j = idx[i];
+                    let lane = a[(j & 0x0f) as usize];
+                    out[i] = if j < 16 { lane } else { 0 };
+                }
+                Self::from_le_bytes(out)
+            }
         }
-        Self::from_le_bytes(out)
     }
 
     #[doc(alias = "i8x16.relaxed_swizzle")]
@@ -155,14 +190,45 @@ impl Value128 {
 
     #[doc(alias = "i8x16.shuffle")]
     pub fn i8x16_shuffle(a: Self, b: Self, idx: [u8; 16]) -> Self {
-        let mut src = [0u8; 32];
-        src[..16].copy_from_slice(&a.to_le_bytes());
-        src[16..].copy_from_slice(&b.to_le_bytes());
-        let mut out = [0u8; 16];
-        for i in 0..16 {
-            out[i] = src[(idx[i] & 31) as usize];
+        simd_impl! {
+            x86 => {
+                let a_bytes = a.to_le_bytes();
+                let b_bytes = b.to_le_bytes();
+                let mut mask_a = [0u8; 16];
+                let mut mask_b = [0u8; 16];
+                for i in 0..16 {
+                    let j = idx[i] & 31;
+                    mask_a[i] = if j < 16 { j } else { 0x80 };
+                    mask_b[i] = if j < 16 { 0x80 } else { j & 0x0f };
+                }
+
+                // SAFETY: all inputs are valid 16-byte buffers, and `_mm_loadu/_mm_storeu` support unaligned accesses.
+                #[allow(unsafe_code)]
+                let out = unsafe {
+                    let a_vec = x86::_mm_loadu_si128(a_bytes.as_ptr().cast::<x86::__m128i>());
+                    let b_vec = x86::_mm_loadu_si128(b_bytes.as_ptr().cast::<x86::__m128i>());
+                    let mask_a_vec = x86::_mm_loadu_si128(mask_a.as_ptr().cast::<x86::__m128i>());
+                    let mask_b_vec = x86::_mm_loadu_si128(mask_b.as_ptr().cast::<x86::__m128i>());
+                    let a_part = x86::_mm_shuffle_epi8(a_vec, mask_a_vec);
+                    let b_part = x86::_mm_shuffle_epi8(b_vec, mask_b_vec);
+                    let result = x86::_mm_or_si128(a_part, b_part);
+                    let mut out = [0u8; 16];
+                    x86::_mm_storeu_si128(out.as_mut_ptr().cast::<x86::__m128i>(), result);
+                    out
+                };
+                Self::from_le_bytes(out)
+            }
+            generic => {
+                let a_bytes = a.to_le_bytes();
+                let b_bytes = b.to_le_bytes();
+                let mut out = [0u8; 16];
+                for i in 0..16 {
+                    let j = idx[i] & 31;
+                    out[i] = if j < 16 { a_bytes[j as usize] } else { b_bytes[(j & 0x0f) as usize] };
+                }
+                Self::from_le_bytes(out)
+            }
         }
-        Self::from_le_bytes(out)
     }
 
     #[doc(alias = "i8x16.splat")]
diff --git a/crates/tinywasm/src/interpreter/simd/macros.rs b/crates/tinywasm/src/interpreter/simd/macros.rs
@@ -1,3 +1,5 @@
+#![allow(unused_macros)]
+
 macro_rules! simd_impl {
     ($(wasm => $wasm:block)? $(x86 => $x86:block)? generic => $generic:block) => {{
         #[cfg(any(target_arch = "wasm32", target_arch = "wasm64"))]
@@ -23,6 +25,7 @@ macro_rules! simd_impl {
             simd_impl!(@pick_x86 $( $x86 )? ; $generic)
         }
 
+        #[allow(unreachable_code)]
         #[cfg(not(any(
             any(target_arch = "wasm32", target_arch = "wasm64"),
             all(
diff --git a/crates/tinywasm/src/interpreter/simd/mod.rs b/crates/tinywasm/src/interpreter/simd/mod.rs
@@ -3,6 +3,8 @@
 #[macro_use]
 mod macros;
 mod instructions;
+#[cfg(test)]
+mod tests;
 mod utils;
 
 #[cfg(target_arch = "wasm32")]
diff --git a/crates/tinywasm/src/interpreter/simd/tests.rs b/crates/tinywasm/src/interpreter/simd/tests.rs
@@ -0,0 +1,60 @@
+use super::Value128;
+
+fn ref_swizzle(a: [u8; 16], idx: [u8; 16]) -> [u8; 16] {
+    let mut out = [0u8; 16];
+    for i in 0..16 {
+        let j = idx[i];
+        out[i] = if j < 16 { a[(j & 0x0f) as usize] } else { 0 };
+    }
+    out
+}
+
+fn ref_shuffle(a: [u8; 16], b: [u8; 16], idx: [u8; 16]) -> [u8; 16] {
+    let mut out = [0u8; 16];
+    for i in 0..16 {
+        let j = idx[i] & 31;
+        out[i] = if j < 16 { a[j as usize] } else { b[(j & 0x0f) as usize] };
+    }
+    out
+}
+
+#[test]
+fn swizzle_matches_reference() {
+    let a = [0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff];
+
+    for seed in 0u32..512 {
+        let mut s = [0u8; 16];
+        let mut x = seed.wrapping_mul(0x9e37_79b9).wrapping_add(0x7f4a_7c15);
+        for byte in &mut s {
+            x ^= x << 13;
+            x ^= x >> 17;
+            x ^= x << 5;
+            *byte = (x & 0xff) as u8;
+        }
+
+        let got = Value128::from_le_bytes(a).i8x16_swizzle(Value128::from_le_bytes(s)).to_le_bytes();
+        let expected = ref_swizzle(a, s);
+        assert_eq!(got, expected, "seed={seed}");
+    }
+}
+
+#[test]
+fn shuffle_matches_reference() {
+    let a = [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f];
+    let b = [0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf];
+
+    for seed in 0u32..512 {
+        let mut idx = [0u8; 16];
+        let mut x = seed.wrapping_mul(0x85eb_ca6b).wrapping_add(0xc2b2_ae35);
+        for byte in &mut idx {
+            x ^= x << 13;
+            x ^= x >> 17;
+            x ^= x << 5;
+            *byte = (x & 0xff) as u8;
+        }
+
+        let got = Value128::i8x16_shuffle(Value128::from_le_bytes(a), Value128::from_le_bytes(b), idx).to_le_bytes();
+        let expected = ref_shuffle(a, b, idx);
+        assert_eq!(got, expected, "seed={seed}");
+    }
+}
diff --git a/crates/tinywasm/src/interpreter/simd/utils.rs b/crates/tinywasm/src/interpreter/simd/utils.rs
@@ -1,5 +1,8 @@
 use super::Value128;
 
+#[cfg(not(feature = "std"))]
+use crate::interpreter::no_std_floats::NoStdFloatExt;
+
 impl Value128 {
     pub(super) fn extract_lane_bytes<const LANE_BYTES: usize>(self, lane: u8, lane_count: u8) -> [u8; LANE_BYTES] {
         debug_assert!(lane < lane_count);
diff --git a/crates/tinywasm/src/lib.rs b/crates/tinywasm/src/lib.rs
@@ -4,7 +4,8 @@
     attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_assignments, unused_variables))
 ))]
 #![warn(missing_docs, rust_2018_idioms, unreachable_pub)]
-#![cfg_attr(not(feature = "simd-x86"), deny(unsafe_code))]
+#![cfg_attr(not(feature = "simd-x86"), forbid(unsafe_code))]
+#![cfg_attr(feature = "simd-x86", deny(unsafe_code))]
 
 //! A tiny WebAssembly Runtime written in Rust
 //!