Skip to content

Commit 479b540

Browse files
feat: x86 simd implementation for i8x16_swizzle + i8x16_shuffle
Signed-off-by: Henry <mail@henrygressmann.de>
1 parent 63e33a2 commit 479b540

File tree

7 files changed

+157
-22
lines changed

7 files changed

+157
-22
lines changed

crates/tinywasm/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ canonicalize_nans=[]
5151
# derive Debug for runtime/types structs
5252
debug=["tinywasm-types/debug"]
5353

54-
# enable x86-specific SIMD intrinsics in Value128
54+
# enable x86-specific SIMD intrinsics in Value128 (uses unsafe code)
5555
# note: for x86 backend selection, compile with x86-64-v3 target features
5656
# (for example: `RUSTFLAGS="-C target-cpu=x86-64-v3"`)
5757
simd-x86=[]

crates/tinywasm/src/interpreter/simd/instructions.rs

Lines changed: 86 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,20 @@ use super::super::no_std_floats::NoStdFloatExt;
88
use core::arch::wasm32 as wasm;
99
#[cfg(target_arch = "wasm64")]
1010
use core::arch::wasm64 as wasm;
11+
#[cfg(all(
12+
feature = "simd-x86",
13+
target_arch = "x86_64",
14+
target_feature = "sse4.2",
15+
target_feature = "avx",
16+
target_feature = "avx2",
17+
target_feature = "bmi1",
18+
target_feature = "bmi2",
19+
target_feature = "fma",
20+
target_feature = "lzcnt",
21+
target_feature = "movbe",
22+
target_feature = "popcnt"
23+
))]
24+
use core::arch::x86_64 as x86;
1125

1226
impl Value128 {
1327
#[doc(alias = "v128.any_true")]
@@ -132,20 +146,41 @@ impl Value128 {
132146

133147
#[doc(alias = "i8x16.swizzle")]
134148
pub fn i8x16_swizzle(self, s: Self) -> Self {
135-
#[cfg(any(target_arch = "wasm32", target_arch = "wasm64"))]
136-
return Self::from_wasm_v128(wasm::i8x16_swizzle(self.to_wasm_v128(), s.to_wasm_v128()));
137-
138-
let a = self.to_le_bytes();
139-
let idx = s.to_le_bytes();
140-
let mut out = [0u8; 16];
141-
let mut i = 0;
142-
while i < 16 {
143-
let j = idx[i];
144-
let lane = a[(j & 0x0f) as usize];
145-
out[i] = if j < 16 { lane } else { 0 };
146-
i += 1;
149+
simd_impl! {
150+
wasm => { Self::from_wasm_v128(wasm::i8x16_swizzle(self.to_wasm_v128(), s.to_wasm_v128())) }
151+
x86 => {
152+
let a = self.to_le_bytes();
153+
let idx = s.to_le_bytes();
154+
let mut mask = [0u8; 16];
155+
for i in 0..16 {
156+
let j = idx[i];
157+
mask[i] = if j < 16 { j & 0x0f } else { 0x80 };
158+
}
159+
160+
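// SAFETY: `a`, `mask`, and `out` are valid 16-byte buffers, and `_mm_loadu_si128`/`_mm_storeu_si128` support unaligned accesses. `_mm_shuffle_epi8` needs SSSE3, which the `sse4.2` target feature gating the `x86` import implies.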
161+
#[allow(unsafe_code)]
162+
let out = unsafe {
163+
let a_vec = x86::_mm_loadu_si128(a.as_ptr().cast::<x86::__m128i>());
164+
let mask_vec = x86::_mm_loadu_si128(mask.as_ptr().cast::<x86::__m128i>());
165+
let result = x86::_mm_shuffle_epi8(a_vec, mask_vec);
166+
let mut out = [0u8; 16];
167+
x86::_mm_storeu_si128(out.as_mut_ptr().cast::<x86::__m128i>(), result);
168+
out
169+
};
170+
Self::from_le_bytes(out)
171+
}
172+
generic => {
173+
let a = self.to_le_bytes();
174+
let idx = s.to_le_bytes();
175+
let mut out = [0u8; 16];
176+
for i in 0..16 {
177+
let j = idx[i];
178+
let lane = a[(j & 0x0f) as usize];
179+
out[i] = if j < 16 { lane } else { 0 };
180+
}
181+
Self::from_le_bytes(out)
182+
}
147183
}
148-
Self::from_le_bytes(out)
149184
}
150185

151186
#[doc(alias = "i8x16.relaxed_swizzle")]
@@ -155,14 +190,45 @@ impl Value128 {
155190

156191
#[doc(alias = "i8x16.shuffle")]
157192
pub fn i8x16_shuffle(a: Self, b: Self, idx: [u8; 16]) -> Self {
158-
let mut src = [0u8; 32];
159-
src[..16].copy_from_slice(&a.to_le_bytes());
160-
src[16..].copy_from_slice(&b.to_le_bytes());
161-
let mut out = [0u8; 16];
162-
for i in 0..16 {
163-
out[i] = src[(idx[i] & 31) as usize];
193+
simd_impl! {
194+
x86 => {
195+
let a_bytes = a.to_le_bytes();
196+
let b_bytes = b.to_le_bytes();
197+
let mut mask_a = [0u8; 16];
198+
let mut mask_b = [0u8; 16];
199+
for i in 0..16 {
200+
let j = idx[i] & 31;
201+
mask_a[i] = if j < 16 { j } else { 0x80 };
202+
mask_b[i] = if j < 16 { 0x80 } else { j & 0x0f };
203+
}
204+
205+
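// SAFETY: `a_bytes`, `b_bytes`, `mask_a`, `mask_b`, and `out` are valid 16-byte buffers, and `_mm_loadu_si128`/`_mm_storeu_si128` support unaligned accesses. `_mm_shuffle_epi8` needs SSSE3, which the `sse4.2` target feature gating the `x86` import implies.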
206+
#[allow(unsafe_code)]
207+
let out = unsafe {
208+
let a_vec = x86::_mm_loadu_si128(a_bytes.as_ptr().cast::<x86::__m128i>());
209+
let b_vec = x86::_mm_loadu_si128(b_bytes.as_ptr().cast::<x86::__m128i>());
210+
let mask_a_vec = x86::_mm_loadu_si128(mask_a.as_ptr().cast::<x86::__m128i>());
211+
let mask_b_vec = x86::_mm_loadu_si128(mask_b.as_ptr().cast::<x86::__m128i>());
212+
let a_part = x86::_mm_shuffle_epi8(a_vec, mask_a_vec);
213+
let b_part = x86::_mm_shuffle_epi8(b_vec, mask_b_vec);
214+
let result = x86::_mm_or_si128(a_part, b_part);
215+
let mut out = [0u8; 16];
216+
x86::_mm_storeu_si128(out.as_mut_ptr().cast::<x86::__m128i>(), result);
217+
out
218+
};
219+
Self::from_le_bytes(out)
220+
}
221+
generic => {
222+
let a_bytes = a.to_le_bytes();
223+
let b_bytes = b.to_le_bytes();
224+
let mut out = [0u8; 16];
225+
for i in 0..16 {
226+
let j = idx[i] & 31;
227+
out[i] = if j < 16 { a_bytes[j as usize] } else { b_bytes[(j & 0x0f) as usize] };
228+
}
229+
Self::from_le_bytes(out)
230+
}
164231
}
165-
Self::from_le_bytes(out)
166232
}
167233

168234
#[doc(alias = "i8x16.splat")]

crates/tinywasm/src/interpreter/simd/macros.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#![allow(unused_macros)]
2+
13
macro_rules! simd_impl {
24
($(wasm => $wasm:block)? $(x86 => $x86:block)? generic => $generic:block) => {{
35
#[cfg(any(target_arch = "wasm32", target_arch = "wasm64"))]
@@ -23,6 +25,7 @@ macro_rules! simd_impl {
2325
simd_impl!(@pick_x86 $( $x86 )? ; $generic)
2426
}
2527

28+
#[allow(unreachable_code)]
2629
#[cfg(not(any(
2730
any(target_arch = "wasm32", target_arch = "wasm64"),
2831
all(

crates/tinywasm/src/interpreter/simd/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#[macro_use]
44
mod macros;
55
mod instructions;
6+
#[cfg(test)]
7+
mod tests;
68
mod utils;
79

810
#[cfg(target_arch = "wasm32")]
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
use super::Value128;
2+
3+
fn ref_swizzle(a: [u8; 16], idx: [u8; 16]) -> [u8; 16] {
4+
let mut out = [0u8; 16];
5+
for i in 0..16 {
6+
let j = idx[i];
7+
out[i] = if j < 16 { a[(j & 0x0f) as usize] } else { 0 };
8+
}
9+
out
10+
}
11+
12+
fn ref_shuffle(a: [u8; 16], b: [u8; 16], idx: [u8; 16]) -> [u8; 16] {
13+
let mut out = [0u8; 16];
14+
for i in 0..16 {
15+
let j = idx[i] & 31;
16+
out[i] = if j < 16 { a[j as usize] } else { b[(j & 0x0f) as usize] };
17+
}
18+
out
19+
}
20+
21+
#[test]
22+
fn swizzle_matches_reference() {
23+
let a = [0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff];
24+
25+
for seed in 0u32..512 {
26+
let mut s = [0u8; 16];
27+
let mut x = seed.wrapping_mul(0x9e37_79b9).wrapping_add(0x7f4a_7c15);
28+
for byte in &mut s {
29+
x ^= x << 13;
30+
x ^= x >> 17;
31+
x ^= x << 5;
32+
*byte = (x & 0xff) as u8;
33+
}
34+
35+
let got = Value128::from_le_bytes(a).i8x16_swizzle(Value128::from_le_bytes(s)).to_le_bytes();
36+
let expected = ref_swizzle(a, s);
37+
assert_eq!(got, expected, "seed={seed}");
38+
}
39+
}
40+
41+
#[test]
42+
fn shuffle_matches_reference() {
43+
let a = [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f];
44+
let b = [0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf];
45+
46+
for seed in 0u32..512 {
47+
let mut idx = [0u8; 16];
48+
let mut x = seed.wrapping_mul(0x85eb_ca6b).wrapping_add(0xc2b2_ae35);
49+
for byte in &mut idx {
50+
x ^= x << 13;
51+
x ^= x >> 17;
52+
x ^= x << 5;
53+
*byte = (x & 0xff) as u8;
54+
}
55+
56+
let got = Value128::i8x16_shuffle(Value128::from_le_bytes(a), Value128::from_le_bytes(b), idx).to_le_bytes();
57+
let expected = ref_shuffle(a, b, idx);
58+
assert_eq!(got, expected, "seed={seed}");
59+
}
60+
}

crates/tinywasm/src/interpreter/simd/utils.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
use super::Value128;
22

3+
#[cfg(not(feature = "std"))]
4+
use crate::interpreter::no_std_floats::NoStdFloatExt;
5+
36
impl Value128 {
47
pub(super) fn extract_lane_bytes<const LANE_BYTES: usize>(self, lane: u8, lane_count: u8) -> [u8; LANE_BYTES] {
58
debug_assert!(lane < lane_count);

crates/tinywasm/src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_assignments, unused_variables))
55
))]
66
#![warn(missing_docs, rust_2018_idioms, unreachable_pub)]
7-
#![cfg_attr(not(feature = "simd-x86"), deny(unsafe_code))]
7+
#![cfg_attr(not(feature = "simd-x86"), forbid(unsafe_code))]
8+
#![cfg_attr(feature = "simd-x86", deny(unsafe_code))]
89

910
//! A tiny WebAssembly Runtime written in Rust
1011
//!

0 commit comments

Comments
 (0)