Skip to content

Commit ca218e5

Browse files
authored
Merge pull request #2012 from folkertdev/llvm-22-hadds-hsubs
x86: use `intrinsics::simd` for `hadds`/`hsubs`
2 parents 6b17a2a + e940b04 commit ca218e5

2 files changed

Lines changed: 44 additions & 14 deletions

File tree

crates/core_arch/src/x86/avx2.rs

Lines changed: 30 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -991,7 +991,21 @@ pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
991991
#[cfg_attr(test, assert_instr(vphaddsw))]
992992
#[stable(feature = "simd_x86", since = "1.27.0")]
993993
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
994-
unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
994+
let a = a.as_i16x16();
995+
let b = b.as_i16x16();
996+
unsafe {
997+
let even: i16x16 = simd_shuffle!(
998+
a,
999+
b,
1000+
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
1001+
);
1002+
let odd: i16x16 = simd_shuffle!(
1003+
a,
1004+
b,
1005+
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
1006+
);
1007+
simd_saturating_add(even, odd).as_m256i()
1008+
}
9951009
}
9961010

9971011
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
@@ -1047,7 +1061,21 @@ pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
10471061
#[cfg_attr(test, assert_instr(vphsubsw))]
10481062
#[stable(feature = "simd_x86", since = "1.27.0")]
10491063
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1050-
unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
1064+
let a = a.as_i16x16();
1065+
let b = b.as_i16x16();
1066+
unsafe {
1067+
let even: i16x16 = simd_shuffle!(
1068+
a,
1069+
b,
1070+
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
1071+
);
1072+
let odd: i16x16 = simd_shuffle!(
1073+
a,
1074+
b,
1075+
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
1076+
);
1077+
simd_saturating_sub(even, odd).as_m256i()
1078+
}
10511079
}
10521080

10531081
/// Returns values from `slice` at offsets determined by `offsets * scale`,
@@ -3791,10 +3819,6 @@ pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
37913819

37923820
#[allow(improper_ctypes)]
37933821
unsafe extern "C" {
3794-
#[link_name = "llvm.x86.avx2.phadd.sw"]
3795-
fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3796-
#[link_name = "llvm.x86.avx2.phsub.sw"]
3797-
fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
37983822
#[link_name = "llvm.x86.avx2.pmadd.wd"]
37993823
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
38003824
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]

crates/core_arch/src/x86/ssse3.rs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,13 @@ pub const fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
188188
#[cfg_attr(test, assert_instr(phaddsw))]
189189
#[stable(feature = "simd_x86", since = "1.27.0")]
190190
pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
191-
unsafe { transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) }
191+
let a = a.as_i16x8();
192+
let b = b.as_i16x8();
193+
unsafe {
194+
let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
195+
let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
196+
simd_saturating_add(even, odd).as_m128i()
197+
}
192198
}
193199

194200
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -240,7 +246,13 @@ pub const fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
240246
#[cfg_attr(test, assert_instr(phsubsw))]
241247
#[stable(feature = "simd_x86", since = "1.27.0")]
242248
pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
243-
unsafe { transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) }
249+
let a = a.as_i16x8();
250+
let b = b.as_i16x8();
251+
unsafe {
252+
let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
253+
let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
254+
simd_saturating_sub(even, odd).as_m128i()
255+
}
244256
}
245257

246258
/// Horizontally subtract the adjacent pairs of values contained in 2
@@ -337,12 +349,6 @@ unsafe extern "C" {
337349
#[link_name = "llvm.x86.ssse3.pshuf.b.128"]
338350
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
339351

340-
#[link_name = "llvm.x86.ssse3.phadd.sw.128"]
341-
fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
342-
343-
#[link_name = "llvm.x86.ssse3.phsub.sw.128"]
344-
fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
345-
346352
#[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
347353
fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;
348354

0 commit comments

Comments
 (0)