Skip to content

Commit 030e64c

Browse files
authored
Merge pull request #2014 from folkertdev/llvm-22-madd
Revert "Use LLVM intrinsics for `madd` intrinsics"
2 parents ca218e5 + 8a883ad commit 030e64c

3 files changed

Lines changed: 63 additions & 63 deletions

File tree

crates/core_arch/src/x86/avx2.rs

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1841,20 +1841,14 @@ pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) ->
18411841
#[target_feature(enable = "avx2")]
18421842
#[cfg_attr(test, assert_instr(vpmaddwd))]
18431843
#[stable(feature = "simd_x86", since = "1.27.0")]
1844-
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1845-
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
1846-
//
1847-
// ```rust
1848-
// #[target_feature(enable = "avx2")]
1849-
// unsafe fn widening_add(mad: __m256i) -> __m256i {
1850-
// _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
1851-
// }
1852-
// ```
1853-
//
1854-
// If we implement this using generic vector intrinsics, the optimizer
1855-
// will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
1856-
// For this reason, we use x86 intrinsics.
1857-
unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1844+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1845+
pub const fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1846+
unsafe {
1847+
let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
1848+
let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
1849+
let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
1850+
simd_add(even, odd).as_m256i()
1851+
}
18581852
}
18591853

18601854
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3819,8 +3813,6 @@ pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
38193813

38203814
#[allow(improper_ctypes)]
38213815
unsafe extern "C" {
3822-
#[link_name = "llvm.x86.avx2.pmadd.wd"]
3823-
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
38243816
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
38253817
fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
38263818
#[link_name = "llvm.x86.avx2.mpsadbw"]
@@ -4669,14 +4661,24 @@ mod tests {
46694661
}
46704662

46714663
#[simd_test(enable = "avx2")]
4672-
fn test_mm256_madd_epi16() {
4664+
const fn test_mm256_madd_epi16() {
46734665
let a = _mm256_set1_epi16(2);
46744666
let b = _mm256_set1_epi16(4);
46754667
let r = _mm256_madd_epi16(a, b);
46764668
let e = _mm256_set1_epi32(16);
46774669
assert_eq_m256i(r, e);
46784670
}
46794671

4672+
#[target_feature(enable = "avx2")]
4673+
#[cfg_attr(test, assert_instr(vpmaddwd))]
4674+
unsafe fn test_mm256_madd_epi16_mul_one(mad: __m256i) -> __m256i {
4675+
// This is a trick used in the adler32 algorithm to get a widening addition. The
4676+
// multiplication by 1 is trivial, but must not be optimized out because then the vpmaddwd
4677+
// instruction is no longer selected. The assert_instr verifies that this is the case.
4678+
let one_v = _mm256_set1_epi16(1);
4679+
_mm256_madd_epi16(mad, one_v)
4680+
}
4681+
46804682
#[simd_test(enable = "avx2")]
46814683
const fn test_mm256_inserti128_si256() {
46824684
let a = _mm256_setr_epi64x(1, 2, 3, 4);

crates/core_arch/src/x86/avx512bw.rs

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6321,20 +6321,22 @@ pub const unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a:
63216321
#[target_feature(enable = "avx512bw")]
63226322
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63236323
#[cfg_attr(test, assert_instr(vpmaddwd))]
6324-
pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6325-
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
6326-
//
6327-
// ```rust
6328-
// #[target_feature(enable = "avx512bw")]
6329-
// unsafe fn widening_add(mad: __m512i) -> __m512i {
6330-
// _mm512_madd_epi16(mad, _mm512_set1_epi16(1))
6331-
// }
6332-
// ```
6333-
//
6334-
// If we implement this using generic vector intrinsics, the optimizer
6335-
// will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
6336-
// For this reason, we use x86 intrinsics.
6337-
unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
6324+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6325+
pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6326+
unsafe {
6327+
let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
6328+
let even: i32x16 = simd_shuffle!(
6329+
r,
6330+
r,
6331+
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
6332+
);
6333+
let odd: i32x16 = simd_shuffle!(
6334+
r,
6335+
r,
6336+
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
6337+
);
6338+
simd_add(even, odd).as_m512i()
6339+
}
63386340
}
63396341

63406342
/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6344,7 +6346,8 @@ pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
63446346
#[target_feature(enable = "avx512bw")]
63456347
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63466348
#[cfg_attr(test, assert_instr(vpmaddwd))]
6347-
pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6349+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6350+
pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63486351
unsafe {
63496352
let madd = _mm512_madd_epi16(a, b).as_i32x16();
63506353
transmute(simd_select_bitmask(k, madd, src.as_i32x16()))
@@ -6358,7 +6361,8 @@ pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i
63586361
#[target_feature(enable = "avx512bw")]
63596362
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63606363
#[cfg_attr(test, assert_instr(vpmaddwd))]
6361-
pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6364+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6365+
pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63626366
unsafe {
63636367
let madd = _mm512_madd_epi16(a, b).as_i32x16();
63646368
transmute(simd_select_bitmask(k, madd, i32x16::ZERO))
@@ -6372,7 +6376,8 @@ pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
63726376
#[target_feature(enable = "avx512bw,avx512vl")]
63736377
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63746378
#[cfg_attr(test, assert_instr(vpmaddwd))]
6375-
pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6379+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6380+
pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63766381
unsafe {
63776382
let madd = _mm256_madd_epi16(a, b).as_i32x8();
63786383
transmute(simd_select_bitmask(k, madd, src.as_i32x8()))
@@ -6386,7 +6391,8 @@ pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i)
63866391
#[target_feature(enable = "avx512bw,avx512vl")]
63876392
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63886393
#[cfg_attr(test, assert_instr(vpmaddwd))]
6389-
pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6394+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6395+
pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63906396
unsafe {
63916397
let madd = _mm256_madd_epi16(a, b).as_i32x8();
63926398
transmute(simd_select_bitmask(k, madd, i32x8::ZERO))
@@ -6400,7 +6406,8 @@ pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
64006406
#[target_feature(enable = "avx512bw,avx512vl")]
64016407
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64026408
#[cfg_attr(test, assert_instr(vpmaddwd))]
6403-
pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6409+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6410+
pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
64046411
unsafe {
64056412
let madd = _mm_madd_epi16(a, b).as_i32x4();
64066413
transmute(simd_select_bitmask(k, madd, src.as_i32x4()))
@@ -6414,7 +6421,8 @@ pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) ->
64146421
#[target_feature(enable = "avx512bw,avx512vl")]
64156422
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64166423
#[cfg_attr(test, assert_instr(vpmaddwd))]
6417-
pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6424+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6425+
pub const fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
64186426
unsafe {
64196427
let madd = _mm_madd_epi16(a, b).as_i32x4();
64206428
transmute(simd_select_bitmask(k, madd, i32x4::ZERO))
@@ -12574,8 +12582,6 @@ unsafe extern "C" {
1257412582
#[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
1257512583
fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
1257612584

12577-
#[link_name = "llvm.x86.avx512.pmaddw.d.512"]
12578-
fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
1257912585
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
1258012586
fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
1258112587

@@ -17500,7 +17506,7 @@ mod tests {
1750017506
}
1750117507

1750217508
#[simd_test(enable = "avx512bw")]
17503-
fn test_mm512_madd_epi16() {
17509+
const fn test_mm512_madd_epi16() {
1750417510
let a = _mm512_set1_epi16(1);
1750517511
let b = _mm512_set1_epi16(1);
1750617512
let r = _mm512_madd_epi16(a, b);
@@ -17509,7 +17515,7 @@ mod tests {
1750917515
}
1751017516

1751117517
#[simd_test(enable = "avx512bw")]
17512-
fn test_mm512_mask_madd_epi16() {
17518+
const fn test_mm512_mask_madd_epi16() {
1751317519
let a = _mm512_set1_epi16(1);
1751417520
let b = _mm512_set1_epi16(1);
1751517521
let r = _mm512_mask_madd_epi16(a, 0, a, b);
@@ -17537,7 +17543,7 @@ mod tests {
1753717543
}
1753817544

1753917545
#[simd_test(enable = "avx512bw")]
17540-
fn test_mm512_maskz_madd_epi16() {
17546+
const fn test_mm512_maskz_madd_epi16() {
1754117547
let a = _mm512_set1_epi16(1);
1754217548
let b = _mm512_set1_epi16(1);
1754317549
let r = _mm512_maskz_madd_epi16(0, a, b);
@@ -17548,7 +17554,7 @@ mod tests {
1754817554
}
1754917555

1755017556
#[simd_test(enable = "avx512bw,avx512vl")]
17551-
fn test_mm256_mask_madd_epi16() {
17557+
const fn test_mm256_mask_madd_epi16() {
1755217558
let a = _mm256_set1_epi16(1);
1755317559
let b = _mm256_set1_epi16(1);
1755417560
let r = _mm256_mask_madd_epi16(a, 0, a, b);
@@ -17568,7 +17574,7 @@ mod tests {
1756817574
}
1756917575

1757017576
#[simd_test(enable = "avx512bw,avx512vl")]
17571-
fn test_mm256_maskz_madd_epi16() {
17577+
const fn test_mm256_maskz_madd_epi16() {
1757217578
let a = _mm256_set1_epi16(1);
1757317579
let b = _mm256_set1_epi16(1);
1757417580
let r = _mm256_maskz_madd_epi16(0, a, b);
@@ -17579,7 +17585,7 @@ mod tests {
1757917585
}
1758017586

1758117587
#[simd_test(enable = "avx512bw,avx512vl")]
17582-
fn test_mm_mask_madd_epi16() {
17588+
const fn test_mm_mask_madd_epi16() {
1758317589
let a = _mm_set1_epi16(1);
1758417590
let b = _mm_set1_epi16(1);
1758517591
let r = _mm_mask_madd_epi16(a, 0, a, b);
@@ -17590,7 +17596,7 @@ mod tests {
1759017596
}
1759117597

1759217598
#[simd_test(enable = "avx512bw,avx512vl")]
17593-
fn test_mm_maskz_madd_epi16() {
17599+
const fn test_mm_maskz_madd_epi16() {
1759417600
let a = _mm_set1_epi16(1);
1759517601
let b = _mm_set1_epi16(1);
1759617602
let r = _mm_maskz_madd_epi16(0, a, b);

crates/core_arch/src/x86/sse2.rs

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -210,20 +210,14 @@ pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
210210
#[target_feature(enable = "sse2")]
211211
#[cfg_attr(test, assert_instr(pmaddwd))]
212212
#[stable(feature = "simd_x86", since = "1.27.0")]
213-
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
214-
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
215-
//
216-
// ```rust
217-
// #[target_feature(enable = "sse2")]
218-
// unsafe fn widening_add(mad: __m128i) -> __m128i {
219-
// _mm_madd_epi16(mad, _mm_set1_epi16(1))
220-
// }
221-
// ```
222-
//
223-
// If we implement this using generic vector intrinsics, the optimizer
224-
// will eliminate this pattern, and `pmaddwd` will no longer be emitted.
225-
// For this reason, we use x86 intrinsics.
226-
unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
213+
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
214+
pub const fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
215+
unsafe {
216+
let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
217+
let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
218+
let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
219+
simd_add(even, odd).as_m128i()
220+
}
227221
}
228222

229223
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -3193,8 +3187,6 @@ unsafe extern "C" {
31933187
fn lfence();
31943188
#[link_name = "llvm.x86.sse2.mfence"]
31953189
fn mfence();
3196-
#[link_name = "llvm.x86.sse2.pmadd.wd"]
3197-
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
31983190
#[link_name = "llvm.x86.sse2.psad.bw"]
31993191
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
32003192
#[link_name = "llvm.x86.sse2.psll.w"]
@@ -3473,7 +3465,7 @@ mod tests {
34733465
}
34743466

34753467
#[simd_test(enable = "sse2")]
3476-
fn test_mm_madd_epi16() {
3468+
const fn test_mm_madd_epi16() {
34773469
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
34783470
let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
34793471
let r = _mm_madd_epi16(a, b);

0 commit comments

Comments
 (0)