PPU/SPU LLVM: Use arm shuffles in recompilers instead of emulating x86 pshufb

Whatcookie · Whatcookie · commit 0bbae687f9e0 · 2026-04-28T17:24:24.000-04:00
> - SHUFB from 9 instructions down to 5
> - Though it should be 4 if LLVM would just emit BCAX...
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -4030,6 +4030,109 @@ template <typename T1, typename T2>
 		});
 	}
 
+#ifdef ARCH_ARM64
+	// Single-register byte table lookup (AArch64 TBL): lane i of the result is byte b[i] of a,
+	// or 0 when b[i] is 16 or more.
+	template <typename T1, typename T2>
+	value_t<u8[16]> tbl(T1 a, T2 b)
+	{
+		value_t<u8[16]> result;
+		const auto table = a.eval(m_ir);
+		const auto index = b.eval(m_ir);
+
+		if (const auto const_index = llvm::dyn_cast<llvm::Constant>(index))
+		{
+			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(const_index);
+			// Constant indices: emit a generic shufflevector instead of the intrinsic call
+			if (cv || llvm::isa<llvm::ConstantAggregateZero>(const_index))
+			{
+				// Shuffle lane 16 reads the zero vector, reproducing the out-of-range result
+				v128 mask{};
+
+				for (u32 i = 0; i < 16; i++)
+				{
+					const u64 b_val = cv ? cv->getElementAsInteger(i) : 0;
+					mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
+				}
+
+				const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
+				const auto mask8 = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+				const auto mask32 = m_ir->CreateZExt(mask8, get_type<u32[16]>());
+				result.value = m_ir->CreateShuffleVector(table, zeros, mask32);
+				return result;
+			}
+		}
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { table, index });
+		return result;
+	}
+
+	// Two-register byte table lookup (AArch64 TBL, 2 registers): lane i reads byte indices[i] of the
+	// 32-byte table a:b (a = bytes 0..15, b = bytes 16..31); an index of 32 or more yields 0.
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
+	{
+		value_t<u8[16]> result;
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		if (auto c = llvm::dyn_cast<llvm::Constant>(index))
+		{
+			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
+			// Constant indices: fold into a generic shufflevector plus an AND that zeroes invalid lanes
+			if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
+			{
+				v128 mask{};
+				v128 bitmask{};
+				for (u32 i = 0; i < 16; i++)
+				{
+					// An all-zero index vector is a ConstantAggregateZero (cv == nullptr): every lane
+					// selects byte 0, which is in range, so its bitmask must be 0xFF rather than 0
+					const u64 b_val = cv ? cv->getElementAsInteger(i) : 0;
+					mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
+					bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
+				}
+
+				auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+				auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
+				auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
+				auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
+				result.value = m_ir->CreateAnd(lookup, z_mask);
+				return result;
+			}
+		}
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
+		return result;
+	}
+
+	// Single-register table lookup with fallback (AArch64 TBX): like TBL, except that lanes whose index is out of range keep the matching byte of `fallback`.
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
+	{
+		const auto keep = fallback.eval(m_ir);
+		const auto table = a.eval(m_ir);
+		const auto selector = indices.eval(m_ir);
+		value_t<u8[16]> out;
+		out.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { keep, table, selector });
+		return out;
+	}
+
+	// Two-register table lookup with fallback (AArch64 TBX, 2 registers): indexes the 32-byte table a:b; lanes whose index is out of range keep the matching byte of `fallback`.
+	template <typename T1, typename T2, typename T3, typename T4>
+	value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
+	{
+		const auto keep = fallback.eval(m_ir);
+		const auto table_lo = a.eval(m_ir);
+		const auto table_hi = b.eval(m_ir);
+		const auto selector = indices.eval(m_ir);
+		value_t<u8[16]> out;
+		out.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { keep, table_lo, table_hi, selector });
+		return out;
+	}
+#endif
+
 	// (m << 3) >= 0 ? a : b
 	template <typename T, typename U, typename V>
 	static auto select_by_bit4(T&& m, U&& a, V&& b)
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -1642,6 +1642,18 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 {
 	const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
 
+#ifdef ARCH_ARM64
+
+	if (op.ra == op.rb)
+	{
+		set_vr(op.vd, tbl(a, (~c & 0xf)));
+		return;
+	}
+
+	set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
+}
+#else
+
 	if (op.ra == op.rb)
 	{
 		set_vr(op.vd, pshufb(a, ~c & 0xf));
@@ -1658,6 +1670,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 	const auto i = eval(~c & 0x1f);
 	set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
 }
+#endif
 
 void PPUTranslator::VPKPX(ppu_opcode_t op)
 {
diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@@ -6745,6 +6745,77 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 		const auto a = get_vr<u8[16]>(op.ra);
 		const auto b = get_vr<u8[16]>(op.rb);
 
+#ifdef ARCH_ARM64
+
+		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
+		{
+			if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
+			{
+				if (op.ra == op.rb)
+				{
+					if (perm_only)
+					{
+						const auto cm = eval(c & 0x0f);
+						set_vr(op.rt4, tbl(as, cm));
+						return;
+					}
+				
+					const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));                        
+					const auto cm = eval(c & 0x8f);                                                                                                               
+ 					set_vr(op.rt4, tbx(x, as, cm));
+					return;
+				}
+
+				if (perm_only)
+				{
+					const auto cm = eval(c & 0x1f);
+					set_vr(op.rt4, tbl2(as, bs, cm));
+					return;
+				}
+
+				const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+				const auto cm = eval(c & 0x9f);
+				set_vr(op.rt4, tbx2(x, as, bs, cm));
+				return;
+			}
+
+		}
+
+
+		if (op.ra == op.rb && !m_interp_magn)
+		{
+			if (perm_only)
+			{
+				const auto cm = eval(c & 0x0f);
+				const auto cr = eval(cm ^ 0x0f);
+				set_vr(op.rt4, tbl(a, cr));
+				return;
+			}
+
+			const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+			const auto cm = eval(c & 0x8f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbx(x, a, cr));
+			return;
+		}
+
+		if (perm_only)
+		{
+			const auto cm = eval(c & 0x9f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbl2(a, b, cr));
+			return;
+		}
+
+		const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); 
+		// AND should be before XOR so that llvm can combine them into BCAX
+		// Though for some reason it doesn't seem to be doing that.
+		const auto cm = eval(c & ~0x60);
+		const auto cr = eval(cm ^ 0x0f);
+		set_vr(op.rt4, tbx2(x, a, b, cr));
+	}                                                                                              
+#else
+
 		// Data with swapped endian from a load instruction
 		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
 		{
@@ -6889,6 +6960,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 		else
 			set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
 	}
+#endif
 
 	void MPYA(spu_opcode_t op)
 	{

Original file line number	Diff line number	Diff line change
`@@ -1642,6 +1642,18 @@ void PPUTranslator::VPERM(ppu_opcode_t op)`
`1642`	`1642`	`{`
`1643`	`1643`	`const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);`
`1644`	`1644`
	`1645`	`+#ifdef ARCH_ARM64`
	`1646`	`+`
	`1647`	`+ if (op.ra == op.rb)`
	`1648`	`+ {`
	`1649`	`+ set_vr(op.vd, tbl(a, (~c & 0xf)));`
	`1650`	`+ return;`
	`1651`	`+ }`
	`1652`	`+`
	`1653`	`+ set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));`
	`1654`	`+}`
	`1655`	`+#else`
	`1656`	`+`
`1645`	`1657`	`if (op.ra == op.rb)`
`1646`	`1658`	`{`
`1647`	`1659`	`set_vr(op.vd, pshufb(a, ~c & 0xf));`
`@@ -1658,6 +1670,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)`
`1658`	`1670`	`const auto i = eval(~c & 0x1f);`
`1659`	`1671`	`set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));`
`1660`	`1672`	`}`
	`1673`	`+#endif`
`1661`	`1674`
`1662`	`1675`	`void PPUTranslator::VPKPX(ppu_opcode_t op)`
`1663`	`1676`	`{`