Skip to content

Commit 6d61dc3

Browse files
authored
Merge pull request #2018 from rust-lang/revert-2014-llvm-22-madd
Revert "Revert "Use LLVM intrinsics for `madd` intrinsics""
2 parents 212637b + 560d922 commit 6d61dc3

3 files changed

Lines changed: 75 additions & 55 deletions

File tree

crates/core_arch/src/x86/avx2.rs

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1841,14 +1841,20 @@ pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) ->
18411841
#[target_feature(enable = "avx2")]
18421842
#[cfg_attr(test, assert_instr(vpmaddwd))]
18431843
#[stable(feature = "simd_x86", since = "1.27.0")]
1844-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1845-
pub const fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1846-
unsafe {
1847-
let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
1848-
let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
1849-
let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
1850-
simd_add(even, odd).as_m256i()
1851-
}
1844+
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1845+
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
1846+
//
1847+
// ```rust
1848+
// #[target_feature(enable = "avx2")]
1849+
// unsafe fn widening_add(mad: __m256i) -> __m256i {
1850+
// _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
1851+
// }
1852+
// ```
1853+
//
1854+
// If we implement this using generic vector intrinsics, the optimizer
1855+
// will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
1856+
// For this reason, we use x86 intrinsics.
1857+
unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
18521858
}
18531859

18541860
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3813,6 +3819,8 @@ pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
38133819

38143820
#[allow(improper_ctypes)]
38153821
unsafe extern "C" {
3822+
#[link_name = "llvm.x86.avx2.pmadd.wd"]
3823+
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
38163824
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
38173825
fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
38183826
#[link_name = "llvm.x86.avx2.mpsadbw"]
@@ -4661,7 +4669,7 @@ mod tests {
46614669
}
46624670

46634671
#[simd_test(enable = "avx2")]
4664-
const fn test_mm256_madd_epi16() {
4672+
fn test_mm256_madd_epi16() {
46654673
let a = _mm256_set1_epi16(2);
46664674
let b = _mm256_set1_epi16(4);
46674675
let r = _mm256_madd_epi16(a, b);
@@ -4671,12 +4679,22 @@ mod tests {
46714679

46724680
#[target_feature(enable = "avx2")]
46734681
#[cfg_attr(test, assert_instr(vpmaddwd))]
4674-
unsafe fn test_mm256_madd_epi16_mul_one(mad: __m256i) -> __m256i {
4682+
unsafe fn test_mm256_madd_epi16_mul_one(v: __m256i) -> __m256i {
46754683
// This is a trick used in the adler32 algorithm to get a widening addition. The
46764684
// multiplication by 1 is trivial, but must not be optimized out because then the vpmaddwd
46774685
// instruction is no longer selected. The assert_instr verifies that this is the case.
46784686
let one_v = _mm256_set1_epi16(1);
4679-
_mm256_madd_epi16(mad, one_v)
4687+
_mm256_madd_epi16(v, one_v)
4688+
}
4689+
4690+
#[target_feature(enable = "avx2")]
4691+
#[cfg_attr(test, assert_instr(vpmaddwd))]
4692+
unsafe fn test_mm256_madd_epi16_shl(v: __m256i) -> __m256i {
4693+
// This is a trick used in the base64 algorithm to get a widening addition. Instead of a
4694+
// multiplication, a vector shl is used. In LLVM 22 that breaks the pattern recognition
4695+
// for the automatic optimization to vpmaddwd.
4696+
let shift_value = _mm256_set1_epi32(12i32);
4697+
_mm256_madd_epi16(v, shift_value)
46804698
}
46814699

46824700
#[simd_test(enable = "avx2")]

crates/core_arch/src/x86/avx512bw.rs

Lines changed: 29 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6321,22 +6321,20 @@ pub const unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a:
63216321
#[target_feature(enable = "avx512bw")]
63226322
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63236323
#[cfg_attr(test, assert_instr(vpmaddwd))]
6324-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6325-
pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6326-
unsafe {
6327-
let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
6328-
let even: i32x16 = simd_shuffle!(
6329-
r,
6330-
r,
6331-
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
6332-
);
6333-
let odd: i32x16 = simd_shuffle!(
6334-
r,
6335-
r,
6336-
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
6337-
);
6338-
simd_add(even, odd).as_m512i()
6339-
}
6324+
pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6325+
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
6326+
//
6327+
// ```rust
6328+
// #[target_feature(enable = "avx512bw")]
6329+
// unsafe fn widening_add(mad: __m512i) -> __m512i {
6330+
// _mm512_madd_epi16(mad, _mm512_set1_epi16(1))
6331+
// }
6332+
// ```
6333+
//
6334+
// If we implement this using generic vector intrinsics, the optimizer
6335+
// will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
6336+
// For this reason, we use x86 intrinsics.
6337+
unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
63406338
}
63416339

63426340
/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6346,8 +6344,7 @@ pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
63466344
#[target_feature(enable = "avx512bw")]
63476345
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63486346
#[cfg_attr(test, assert_instr(vpmaddwd))]
6349-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6350-
pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6347+
pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63516348
unsafe {
63526349
let madd = _mm512_madd_epi16(a, b).as_i32x16();
63536350
transmute(simd_select_bitmask(k, madd, src.as_i32x16()))
@@ -6361,8 +6358,7 @@ pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: _
63616358
#[target_feature(enable = "avx512bw")]
63626359
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63636360
#[cfg_attr(test, assert_instr(vpmaddwd))]
6364-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6365-
pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6361+
pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63666362
unsafe {
63676363
let madd = _mm512_madd_epi16(a, b).as_i32x16();
63686364
transmute(simd_select_bitmask(k, madd, i32x16::ZERO))
@@ -6376,8 +6372,7 @@ pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __
63766372
#[target_feature(enable = "avx512bw,avx512vl")]
63776373
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63786374
#[cfg_attr(test, assert_instr(vpmaddwd))]
6379-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6380-
pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6375+
pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63816376
unsafe {
63826377
let madd = _mm256_madd_epi16(a, b).as_i32x8();
63836378
transmute(simd_select_bitmask(k, madd, src.as_i32x8()))
@@ -6391,8 +6386,7 @@ pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __
63916386
#[target_feature(enable = "avx512bw,avx512vl")]
63926387
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63936388
#[cfg_attr(test, assert_instr(vpmaddwd))]
6394-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6395-
pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6389+
pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63966390
unsafe {
63976391
let madd = _mm256_madd_epi16(a, b).as_i32x8();
63986392
transmute(simd_select_bitmask(k, madd, i32x8::ZERO))
@@ -6406,8 +6400,7 @@ pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m
64066400
#[target_feature(enable = "avx512bw,avx512vl")]
64076401
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64086402
#[cfg_attr(test, assert_instr(vpmaddwd))]
6409-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6410-
pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6403+
pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
64116404
unsafe {
64126405
let madd = _mm_madd_epi16(a, b).as_i32x4();
64136406
transmute(simd_select_bitmask(k, madd, src.as_i32x4()))
@@ -6421,8 +6414,7 @@ pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m12
64216414
#[target_feature(enable = "avx512bw,avx512vl")]
64226415
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64236416
#[cfg_attr(test, assert_instr(vpmaddwd))]
6424-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6425-
pub const fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6417+
pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
64266418
unsafe {
64276419
let madd = _mm_madd_epi16(a, b).as_i32x4();
64286420
transmute(simd_select_bitmask(k, madd, i32x4::ZERO))
@@ -12582,6 +12574,8 @@ unsafe extern "C" {
1258212574
#[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
1258312575
fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
1258412576

12577+
#[link_name = "llvm.x86.avx512.pmaddw.d.512"]
12578+
fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
1258512579
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
1258612580
fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
1258712581

@@ -17506,7 +17500,7 @@ mod tests {
1750617500
}
1750717501

1750817502
#[simd_test(enable = "avx512bw")]
17509-
const fn test_mm512_madd_epi16() {
17503+
fn test_mm512_madd_epi16() {
1751017504
let a = _mm512_set1_epi16(1);
1751117505
let b = _mm512_set1_epi16(1);
1751217506
let r = _mm512_madd_epi16(a, b);
@@ -17515,7 +17509,7 @@ mod tests {
1751517509
}
1751617510

1751717511
#[simd_test(enable = "avx512bw")]
17518-
const fn test_mm512_mask_madd_epi16() {
17512+
fn test_mm512_mask_madd_epi16() {
1751917513
let a = _mm512_set1_epi16(1);
1752017514
let b = _mm512_set1_epi16(1);
1752117515
let r = _mm512_mask_madd_epi16(a, 0, a, b);
@@ -17543,7 +17537,7 @@ mod tests {
1754317537
}
1754417538

1754517539
#[simd_test(enable = "avx512bw")]
17546-
const fn test_mm512_maskz_madd_epi16() {
17540+
fn test_mm512_maskz_madd_epi16() {
1754717541
let a = _mm512_set1_epi16(1);
1754817542
let b = _mm512_set1_epi16(1);
1754917543
let r = _mm512_maskz_madd_epi16(0, a, b);
@@ -17554,7 +17548,7 @@ mod tests {
1755417548
}
1755517549

1755617550
#[simd_test(enable = "avx512bw,avx512vl")]
17557-
const fn test_mm256_mask_madd_epi16() {
17551+
fn test_mm256_mask_madd_epi16() {
1755817552
let a = _mm256_set1_epi16(1);
1755917553
let b = _mm256_set1_epi16(1);
1756017554
let r = _mm256_mask_madd_epi16(a, 0, a, b);
@@ -17574,7 +17568,7 @@ mod tests {
1757417568
}
1757517569

1757617570
#[simd_test(enable = "avx512bw,avx512vl")]
17577-
const fn test_mm256_maskz_madd_epi16() {
17571+
fn test_mm256_maskz_madd_epi16() {
1757817572
let a = _mm256_set1_epi16(1);
1757917573
let b = _mm256_set1_epi16(1);
1758017574
let r = _mm256_maskz_madd_epi16(0, a, b);
@@ -17585,7 +17579,7 @@ mod tests {
1758517579
}
1758617580

1758717581
#[simd_test(enable = "avx512bw,avx512vl")]
17588-
const fn test_mm_mask_madd_epi16() {
17582+
fn test_mm_mask_madd_epi16() {
1758917583
let a = _mm_set1_epi16(1);
1759017584
let b = _mm_set1_epi16(1);
1759117585
let r = _mm_mask_madd_epi16(a, 0, a, b);
@@ -17596,7 +17590,7 @@ mod tests {
1759617590
}
1759717591

1759817592
#[simd_test(enable = "avx512bw,avx512vl")]
17599-
const fn test_mm_maskz_madd_epi16() {
17593+
fn test_mm_maskz_madd_epi16() {
1760017594
let a = _mm_set1_epi16(1);
1760117595
let b = _mm_set1_epi16(1);
1760217596
let r = _mm_maskz_madd_epi16(0, a, b);

crates/core_arch/src/x86/sse2.rs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -210,14 +210,20 @@ pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
210210
#[target_feature(enable = "sse2")]
211211
#[cfg_attr(test, assert_instr(pmaddwd))]
212212
#[stable(feature = "simd_x86", since = "1.27.0")]
213-
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
214-
pub const fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
215-
unsafe {
216-
let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
217-
let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
218-
let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
219-
simd_add(even, odd).as_m128i()
220-
}
213+
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
214+
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
215+
//
216+
// ```rust
217+
// #[target_feature(enable = "sse2")]
218+
// unsafe fn widening_add(mad: __m128i) -> __m128i {
219+
// _mm_madd_epi16(mad, _mm_set1_epi16(1))
220+
// }
221+
// ```
222+
//
223+
// If we implement this using generic vector intrinsics, the optimizer
224+
// will eliminate this pattern, and `pmaddwd` will no longer be emitted.
225+
// For this reason, we use x86 intrinsics.
226+
unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
221227
}
222228

223229
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -3187,6 +3193,8 @@ unsafe extern "C" {
31873193
fn lfence();
31883194
#[link_name = "llvm.x86.sse2.mfence"]
31893195
fn mfence();
3196+
#[link_name = "llvm.x86.sse2.pmadd.wd"]
3197+
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
31903198
#[link_name = "llvm.x86.sse2.psad.bw"]
31913199
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
31923200
#[link_name = "llvm.x86.sse2.psll.w"]
@@ -3465,7 +3473,7 @@ mod tests {
34653473
}
34663474

34673475
#[simd_test(enable = "sse2")]
3468-
const fn test_mm_madd_epi16() {
3476+
fn test_mm_madd_epi16() {
34693477
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
34703478
let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
34713479
let r = _mm_madd_epi16(a, b);

0 commit comments

Comments (0)