Auto merge of #150848 - cuviper:beta-next, r=cuviper

rust-bors[bot] · web-flow · commit 3d95eb9be7d7 · 2026-01-09T03:03:33.000Z
[beta] backports - Revert "Rollup merge of #149147 - chenyukang:yukang-fix-unused_assignments-macro-gen-147648, r=JonathanBrouwer" #149657 - Don't lint on interior mutable `const` item coming from derefs #150166 - stdarch subtree update #150639 (partial) r? cuviper
diff --git a/compiler/rustc_lint/src/interior_mutable_consts.rs b/compiler/rustc_lint/src/interior_mutable_consts.rs
@@ -1,6 +1,7 @@
 use rustc_hir::attrs::AttributeKind;
 use rustc_hir::def::{DefKind, Res};
 use rustc_hir::{Expr, ExprKind, ItemKind, Node, find_attr};
+use rustc_middle::ty::adjustment::Adjust;
 use rustc_session::{declare_lint, declare_lint_pass};
 
 use crate::lints::{ConstItemInteriorMutationsDiag, ConstItemInteriorMutationsSuggestionStatic};
@@ -77,6 +78,13 @@ impl<'tcx> LateLintPass<'tcx> for InteriorMutableConsts {
         if let ExprKind::Path(qpath) = &receiver.kind
             && let Res::Def(DefKind::Const | DefKind::AssocConst, const_did) =
                 typeck.qpath_res(qpath, receiver.hir_id)
+            // Don't consider derefs as those can do arbitrary things
+            // like using thread local (see rust-lang/rust#150157)
+            && !cx
+                .typeck_results()
+                .expr_adjustments(receiver)
+                .into_iter()
+                .any(|adj| matches!(adj.kind, Adjust::Deref(_)))
             // Let's do the attribute check after the other checks for perf reasons
             && find_attr!(
                 cx.tcx.get_all_attrs(method_did),
diff --git a/compiler/rustc_mir_transform/src/liveness.rs b/compiler/rustc_mir_transform/src/liveness.rs
@@ -75,11 +75,6 @@ pub(crate) fn check_liveness<'tcx>(tcx: TyCtxt<'tcx>, def_id: LocalDefId) -> Den
         return DenseBitSet::new_empty(0);
     }
 
-    // Don't run unused pass for items generated by foreign macros
-    if tcx.def_span(parent).in_external_macro(tcx.sess.source_map()) {
-        return DenseBitSet::new_empty(0);
-    }
-
     let mut body = &*tcx.mir_promoted(def_id).0.borrow();
     let mut body_mem;
 
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -1754,12 +1754,19 @@ pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m25
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe {
-        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
-        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
-        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
-        simd_add(even, odd).as_m256i()
-    }
+    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
+    //
+    // ```rust
+    // #[target_feature(enable = "avx2")]
+    // unsafe fn widening_add(mad: __m256i) -> __m256i {
+    //     _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
+    // }
+    // ```
+    //
+    // If we implement this using generic vector intrinsics, the optimizer
+    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
+    // For this reason, we use x86 intrinsics.
+    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
 }
 
 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3701,6 +3708,8 @@ unsafe extern "C" {
     fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
     #[link_name = "llvm.x86.avx2.phsub.sw"]
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pmadd.wd"]
+    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
     fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
     #[link_name = "llvm.x86.avx2.mpsadbw"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -5847,20 +5847,19 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe {
-        let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
-        let even: i32x16 = simd_shuffle!(
-            r,
-            r,
-            [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
-        );
-        let odd: i32x16 = simd_shuffle!(
-            r,
-            r,
-            [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
-        );
-        simd_add(even, odd).as_m512i()
-    }
+    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
+    //
+    // ```rust
+    // #[target_feature(enable = "avx512bw")]
+    // unsafe fn widening_add(mad: __m512i) -> __m512i {
+    //     _mm512_madd_epi16(mad, _mm512_set1_epi16(1))
+    // }
+    // ```
+    //
+    // If we implement this using generic vector intrinsics, the optimizer
+    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
+    // For this reason, we use x86 intrinsics.
+    unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
 }
 
 /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -11687,6 +11686,8 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
     fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
 
+    #[link_name = "llvm.x86.avx512.pmaddw.d.512"]
+    fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
     #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
     fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
 
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -201,12 +201,19 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe {
-        let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
-        let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
-        let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
-        simd_add(even, odd).as_m128i()
-    }
+    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
+    //
+    // ```rust
+    // #[target_feature(enable = "sse2")]
+    // unsafe fn widening_add(mad: __m128i) -> __m128i {
+    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
+    // }
+    // ```
+    //
+    // If we implement this using generic vector intrinsics, the optimizer
+    // will eliminate this pattern, and `pmaddwd` will no longer be emitted.
+    // For this reason, we use x86 intrinsics.
+    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
 }
 
 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -3054,6 +3061,8 @@ unsafe extern "C" {
     fn lfence();
     #[link_name = "llvm.x86.sse2.mfence"]
     fn mfence();
+    #[link_name = "llvm.x86.sse2.pmadd.wd"]
+    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
     #[link_name = "llvm.x86.sse2.psad.bw"]
     fn psadbw(a: u8x16, b: u8x16) -> u64x2;
     #[link_name = "llvm.x86.sse2.psll.w"]
diff --git a/src/tools/miri/src/shims/x86/avx2.rs b/src/tools/miri/src/shims/x86/avx2.rs
@@ -245,6 +245,42 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
 
                 shift_simd_by_scalar(this, left, right, which, dest)?;
             }
+            // Used to implement the _mm256_madd_epi16 function.
+            // Multiplies packed signed 16-bit integers in `left` and `right`, producing
+            // intermediate signed 32-bit integers. Horizontally add adjacent pairs of
+            // intermediate 32-bit integers, and pack the results in `dest`.
+            "pmadd.wd" => {
+                let [left, right] =
+                    this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
+
+                let (left, left_len) = this.project_to_simd(left)?;
+                let (right, right_len) = this.project_to_simd(right)?;
+                let (dest, dest_len) = this.project_to_simd(dest)?;
+
+                assert_eq!(left_len, right_len);
+                assert_eq!(dest_len.strict_mul(2), left_len);
+
+                for i in 0..dest_len {
+                    let j1 = i.strict_mul(2);
+                    let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_i16()?;
+                    let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i16()?;
+
+                    let j2 = j1.strict_add(1);
+                    let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_i16()?;
+                    let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i16()?;
+
+                    let dest = this.project_index(&dest, i)?;
+
+                    // Multiplications are i16*i16->i32, which will not overflow.
+                    let mul1 = i32::from(left1).strict_mul(right1.into());
+                    let mul2 = i32::from(left2).strict_mul(right2.into());
+                    // However, this addition can overflow in the most extreme case
+                    // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000
+                    let res = mul1.wrapping_add(mul2);
+
+                    this.write_scalar(Scalar::from_i32(res), &dest)?;
+                }
+            }
             _ => return interp_ok(EmulateItemResult::NotSupported),
         }
         interp_ok(EmulateItemResult::NeedsReturn)
diff --git a/src/tools/miri/src/shims/x86/sse2.rs b/src/tools/miri/src/shims/x86/sse2.rs
@@ -278,6 +278,42 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                     this.copy_op(&this.project_index(&left, i)?, &this.project_index(&dest, i)?)?;
                 }
             }
+            // Used to implement the _mm_madd_epi16 function.
+            // Multiplies packed signed 16-bit integers in `left` and `right`, producing
+            // intermediate signed 32-bit integers. Horizontally add adjacent pairs of
+            // intermediate 32-bit integers, and pack the results in `dest`.
+            "pmadd.wd" => {
+                let [left, right] =
+                    this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
+
+                let (left, left_len) = this.project_to_simd(left)?;
+                let (right, right_len) = this.project_to_simd(right)?;
+                let (dest, dest_len) = this.project_to_simd(dest)?;
+
+                assert_eq!(left_len, right_len);
+                assert_eq!(dest_len.strict_mul(2), left_len);
+
+                for i in 0..dest_len {
+                    let j1 = i.strict_mul(2);
+                    let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_i16()?;
+                    let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i16()?;
+
+                    let j2 = j1.strict_add(1);
+                    let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_i16()?;
+                    let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i16()?;
+
+                    let dest = this.project_index(&dest, i)?;
+
+                    // Multiplications are i16*i16->i32, which will not overflow.
+                    let mul1 = i32::from(left1).strict_mul(right1.into());
+                    let mul2 = i32::from(left2).strict_mul(right2.into());
+                    // However, this addition can overflow in the most extreme case
+                    // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000
+                    let res = mul1.wrapping_add(mul2);
+
+                    this.write_scalar(Scalar::from_i32(res), &dest)?;
+                }
+            }
             _ => return interp_ok(EmulateItemResult::NotSupported),
         }
         interp_ok(EmulateItemResult::NeedsReturn)
diff --git a/tests/ui/lint/const-item-interior-mutations-const-deref.rs b/tests/ui/lint/const-item-interior-mutations-const-deref.rs
@@ -0,0 +1,28 @@
+// Regression test for <https://github.com/rust-lang/rust/issues/150157>
+//
+// We shouldn't lint on user types, including through deref.
+
+//@ check-pass
+
+use std::cell::Cell;
+use std::ops::Deref;
+
+// Cut down version of the issue reproducer without the thread local to just a Deref
+pub struct LocalKey<T> {
+    inner: T,
+}
+
+impl<T> Deref for LocalKey<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
+const LOCAL_COUNT: LocalKey<Cell<usize>> = LocalKey { inner: Cell::new(8) };
+
+fn main() {
+    let count = LOCAL_COUNT.get();
+    LOCAL_COUNT.set(count);
+}
diff --git a/tests/ui/liveness/auxiliary/aux_issue_147648.rs b/tests/ui/liveness/auxiliary/aux_issue_147648.rs
diff --git a/tests/ui/liveness/unused-assignments-from-macro-147648.rs b/tests/ui/liveness/unused-assignments-from-macro-147648.rs