@@ -6321,22 +6321,20 @@ pub const unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a:
63216321#[target_feature(enable = "avx512bw")]
63226322#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63236323#[cfg_attr(test, assert_instr(vpmaddwd))]
6324- #[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6325- pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6326- unsafe {
6327- let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
6328- let even: i32x16 = simd_shuffle!(
6329- r,
6330- r,
6331- [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
6332- );
6333- let odd: i32x16 = simd_shuffle!(
6334- r,
6335- r,
6336- [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
6337- );
6338- simd_add(even, odd).as_m512i()
6339- }
6324+ pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6325+ // It's a trick used in the Adler-32 algorithm to perform a widening addition.
6326+ //
6327+ // ```rust
6328+ // #[target_feature(enable = "avx512bw")]
6329+ // unsafe fn widening_add(mad: __m512i) -> __m512i {
6330+ // _mm512_madd_epi16(mad, _mm512_set1_epi16(1))
6331+ // }
6332+ // ```
6333+ //
6334+ // If we implement this using generic vector intrinsics, the optimizer
6335+ // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
6336+ // For this reason, we use x86 intrinsics.
6337+ unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
63406338}
63416339
63426340/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6346,8 +6344,7 @@ pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
63466344#[target_feature(enable = "avx512bw")]
63476345#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63486346#[cfg_attr(test, assert_instr(vpmaddwd))]
6349- #[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6350- pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6347+ pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63516348 unsafe {
63526349 let madd = _mm512_madd_epi16(a, b).as_i32x16();
63536350 transmute(simd_select_bitmask(k, madd, src.as_i32x16()))
@@ -6361,8 +6358,7 @@ pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: _
63616358#[target_feature(enable = "avx512bw")]
63626359#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63636360#[cfg_attr(test, assert_instr(vpmaddwd))]
6364- #[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6365- pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6361+ pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63666362 unsafe {
63676363 let madd = _mm512_madd_epi16(a, b).as_i32x16();
63686364 transmute(simd_select_bitmask(k, madd, i32x16::ZERO))
@@ -6376,8 +6372,7 @@ pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __
63766372#[target_feature(enable = "avx512bw,avx512vl")]
63776373#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63786374#[cfg_attr(test, assert_instr(vpmaddwd))]
6379- #[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6380- pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6375+ pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63816376 unsafe {
63826377 let madd = _mm256_madd_epi16(a, b).as_i32x8();
63836378 transmute(simd_select_bitmask(k, madd, src.as_i32x8()))
@@ -6391,8 +6386,7 @@ pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __
63916386#[target_feature(enable = "avx512bw,avx512vl")]
63926387#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63936388#[cfg_attr(test, assert_instr(vpmaddwd))]
6394- #[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6395- pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6389+ pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63966390 unsafe {
63976391 let madd = _mm256_madd_epi16(a, b).as_i32x8();
63986392 transmute(simd_select_bitmask(k, madd, i32x8::ZERO))
@@ -6406,8 +6400,7 @@ pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m
64066400#[target_feature(enable = "avx512bw,avx512vl")]
64076401#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64086402#[cfg_attr(test, assert_instr(vpmaddwd))]
6409- #[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6410- pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6403+ pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
64116404 unsafe {
64126405 let madd = _mm_madd_epi16(a, b).as_i32x4();
64136406 transmute(simd_select_bitmask(k, madd, src.as_i32x4()))
@@ -6421,8 +6414,7 @@ pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m12
64216414#[target_feature(enable = "avx512bw,avx512vl")]
64226415#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64236416#[cfg_attr(test, assert_instr(vpmaddwd))]
6424- #[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6425- pub const fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6417+ pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
64266418 unsafe {
64276419 let madd = _mm_madd_epi16(a, b).as_i32x4();
64286420 transmute(simd_select_bitmask(k, madd, i32x4::ZERO))
@@ -12582,6 +12574,8 @@ unsafe extern "C" {
1258212574 #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
1258312575 fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
1258412576
12577+ #[link_name = "llvm.x86.avx512.pmaddw.d.512"]
12578+ fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
1258512579 #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
1258612580 fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
1258712581
@@ -17506,7 +17500,7 @@ mod tests {
1750617500 }
1750717501
1750817502 #[simd_test(enable = "avx512bw")]
17509- const fn test_mm512_madd_epi16() {
17503+ fn test_mm512_madd_epi16() {
1751017504 let a = _mm512_set1_epi16(1);
1751117505 let b = _mm512_set1_epi16(1);
1751217506 let r = _mm512_madd_epi16(a, b);
@@ -17515,7 +17509,7 @@ mod tests {
1751517509 }
1751617510
1751717511 #[simd_test(enable = "avx512bw")]
17518- const fn test_mm512_mask_madd_epi16() {
17512+ fn test_mm512_mask_madd_epi16() {
1751917513 let a = _mm512_set1_epi16(1);
1752017514 let b = _mm512_set1_epi16(1);
1752117515 let r = _mm512_mask_madd_epi16(a, 0, a, b);
@@ -17543,7 +17537,7 @@ mod tests {
1754317537 }
1754417538
1754517539 #[simd_test(enable = "avx512bw")]
17546- const fn test_mm512_maskz_madd_epi16() {
17540+ fn test_mm512_maskz_madd_epi16() {
1754717541 let a = _mm512_set1_epi16(1);
1754817542 let b = _mm512_set1_epi16(1);
1754917543 let r = _mm512_maskz_madd_epi16(0, a, b);
@@ -17554,7 +17548,7 @@ mod tests {
1755417548 }
1755517549
1755617550 #[simd_test(enable = "avx512bw,avx512vl")]
17557- const fn test_mm256_mask_madd_epi16() {
17551+ fn test_mm256_mask_madd_epi16() {
1755817552 let a = _mm256_set1_epi16(1);
1755917553 let b = _mm256_set1_epi16(1);
1756017554 let r = _mm256_mask_madd_epi16(a, 0, a, b);
@@ -17574,7 +17568,7 @@ mod tests {
1757417568 }
1757517569
1757617570 #[simd_test(enable = "avx512bw,avx512vl")]
17577- const fn test_mm256_maskz_madd_epi16() {
17571+ fn test_mm256_maskz_madd_epi16() {
1757817572 let a = _mm256_set1_epi16(1);
1757917573 let b = _mm256_set1_epi16(1);
1758017574 let r = _mm256_maskz_madd_epi16(0, a, b);
@@ -17585,7 +17579,7 @@ mod tests {
1758517579 }
1758617580
1758717581 #[simd_test(enable = "avx512bw,avx512vl")]
17588- const fn test_mm_mask_madd_epi16() {
17582+ fn test_mm_mask_madd_epi16() {
1758917583 let a = _mm_set1_epi16(1);
1759017584 let b = _mm_set1_epi16(1);
1759117585 let r = _mm_mask_madd_epi16(a, 0, a, b);
@@ -17596,7 +17590,7 @@ mod tests {
1759617590 }
1759717591
1759817592 #[simd_test(enable = "avx512bw,avx512vl")]
17599- const fn test_mm_maskz_madd_epi16() {
17593+ fn test_mm_maskz_madd_epi16() {
1760017594 let a = _mm_set1_epi16(1);
1760117595 let b = _mm_set1_epi16(1);
1760217596 let r = _mm_maskz_madd_epi16(0, a, b);
0 commit comments