Skip to content

Commit 10874ac

Browse files
committed
PPU/SPU LLVM: Use arm shuffles in recompilers instead of emulating x86 pshufb
- SHUFB goes from 9 instructions down to 5.
- It should be 4, if LLVM would just emit BCAX.
1 parent d584575 commit 10874ac

3 files changed

Lines changed: 186 additions & 0 deletions

File tree

rpcs3/Emu/CPU/CPUTranslator.h

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3927,6 +3927,109 @@ class cpu_translator
39273927
});
39283928
}
39293929

3930+
#ifdef ARCH_ARM64
3931+
template <typename T1, typename T2>
3932+
value_t<u8[16]> tbl(T1 a, T2 b)
3933+
{
3934+
value_t<u8[16]> result;
3935+
const auto data0 = a.eval(m_ir);
3936+
const auto index = b.eval(m_ir);
3937+
const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
3938+
3939+
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
3940+
{
3941+
v128 mask{};
3942+
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
3943+
3944+
if (cv)
3945+
{
3946+
for (u32 i = 0; i < 16; i++)
3947+
{
3948+
const u64 b_val = cv->getElementAsInteger(i);
3949+
mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
3950+
}
3951+
}
3952+
3953+
3954+
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
3955+
{
3956+
result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
3957+
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
3958+
result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
3959+
return result;
3960+
}
3961+
}
3962+
3963+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
3964+
return result;
3965+
}
3966+
3967+
template <typename T1, typename T2, typename T3>
3968+
value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
3969+
{
3970+
value_t<u8[16]> result;
3971+
const auto data0 = a.eval(m_ir);
3972+
const auto data1 = b.eval(m_ir);
3973+
const auto index = indices.eval(m_ir);
3974+
3975+
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
3976+
{
3977+
v128 mask{};
3978+
v128 bitmask{};
3979+
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
3980+
3981+
if (cv)
3982+
{
3983+
for (u32 i = 0; i < 16; i++)
3984+
{
3985+
const u64 b_val = cv->getElementAsInteger(i);
3986+
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
3987+
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
3988+
}
3989+
}
3990+
3991+
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
3992+
{
3993+
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
3994+
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
3995+
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
3996+
3997+
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
3998+
result.value = m_ir->CreateAnd(lookup, z_mask);
3999+
return result;
4000+
}
4001+
}
4002+
4003+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
4004+
return result;
4005+
}
4006+
4007+
template <typename T1, typename T2, typename T3>
4008+
value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
4009+
{
4010+
value_t<u8[16]> result;
4011+
const auto v_fallback = fallback.eval(m_ir);
4012+
const auto data0 = a.eval(m_ir);
4013+
const auto index = indices.eval(m_ir);
4014+
4015+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
4016+
return result;
4017+
}
4018+
4019+
template <typename T1, typename T2, typename T3, typename T4>
4020+
value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
4021+
{
4022+
value_t<u8[16]> result;
4023+
const auto v_fallback = fallback.eval(m_ir);
4024+
const auto data0 = a.eval(m_ir);
4025+
const auto data1 = b.eval(m_ir);
4026+
const auto index = indices.eval(m_ir);
4027+
4028+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
4029+
return result;
4030+
}
4031+
#endif
4032+
39304033
// (m << 3) >= 0 ? a : b
39314034
template <typename T, typename U, typename V>
39324035
static auto select_by_bit4(T&& m, U&& a, V&& b)

rpcs3/Emu/Cell/PPUTranslator.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1633,6 +1633,17 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
16331633
{
16341634
const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
16351635

1636+
#ifdef ARCH_ARM64
1637+
1638+
if (op.ra == op.rb)
1639+
{
1640+
set_vr(op.vd, tbl(a, (~c & 0xf)));
1641+
return;
1642+
}
1643+
1644+
set_vr(op.vd, tbl2(a, b, (~c & 0xf)));
1645+
#endif
1646+
16361647
if (op.ra == op.rb)
16371648
{
16381649
set_vr(op.vd, pshufb(a, ~c & 0xf));

rpcs3/Emu/Cell/SPULLVMRecompiler.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5918,6 +5918,77 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
59185918
const auto a = get_vr<u8[16]>(op.ra);
59195919
const auto b = get_vr<u8[16]>(op.rb);
59205920

5921+
#ifdef ARCH_ARM64
5922+
5923+
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
5924+
{
5925+
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
5926+
{
5927+
if (op.ra == op.rb)
5928+
{
5929+
if (perm_only)
5930+
{
5931+
const auto cm = eval(c & 0x0f);
5932+
set_vr(op.rt4, tbl(as, cm));
5933+
return;
5934+
}
5935+
5936+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5937+
const auto cm = eval(c & 0x8f);
5938+
set_vr(op.rt4, tbx(x, as, cm));
5939+
return;
5940+
}
5941+
5942+
if (perm_only)
5943+
{
5944+
const auto cm = eval(c & 0x1f);
5945+
set_vr(op.rt4, tbl2(as, bs, cm));
5946+
return;
5947+
}
5948+
5949+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5950+
const auto cm = eval(c & 0x9f);
5951+
set_vr(op.rt4, tbx2(x, as, bs, cm));
5952+
return;
5953+
}
5954+
5955+
}
5956+
5957+
5958+
if (op.ra == op.rb && !m_interp_magn)
5959+
{
5960+
if (perm_only)
5961+
{
5962+
const auto cm = eval(c & 0x0f);
5963+
const auto cr = eval(cm ^ 0x0f);
5964+
set_vr(op.rt4, tbl(a, cr));
5965+
return;
5966+
}
5967+
5968+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5969+
const auto cm = eval(c & 0x8f);
5970+
const auto cr = eval(cm ^ 0x0f);
5971+
set_vr(op.rt4, tbx(x, a, cr));
5972+
return;
5973+
}
5974+
5975+
if (perm_only)
5976+
{
5977+
const auto cm = eval(c & 0x9f);
5978+
const auto cr = eval(cm ^ 0x0f);
5979+
set_vr(op.rt4, tbl2(a, b, cr));
5980+
return;
5981+
}
5982+
5983+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5984+
// AND should be before XOR so that llvm can combine them into BCAX
5985+
// Though for some reason it doesn't seem to be doing that.
5986+
const auto cm = eval(c & ~0x60);
5987+
const auto cr = eval(cm ^ 0x0f);
5988+
set_vr(op.rt4, tbx2(x, a, b, cr));
5989+
}
5990+
#else
5991+
59215992
// Data with swapped endian from a load instruction
59225993
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
59235994
{
@@ -6062,6 +6133,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
60626133
else
60636134
set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
60646135
}
6136+
#endif
60656137

60666138
void MPYA(spu_opcode_t op)
60676139
{

0 commit comments

Comments
 (0)