Skip to content

Commit 0bbae68

Browse files
committed
PPU/SPU LLVM: Use arm shuffles in recompilers instead of emulating x86 pshufb
- SHUFB from 9 instructions down to 5
- Though it should be 4 if LLVM would just emit BCAX...
1 parent e05d359 commit 0bbae68

3 files changed

Lines changed: 188 additions & 0 deletions

File tree

rpcs3/Emu/CPU/CPUTranslator.h

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4030,6 +4030,109 @@ template <typename T1, typename T2>
40304030
});
40314031
}
40324032

4033+
#ifdef ARCH_ARM64
4034+
template <typename T1, typename T2>
4035+
value_t<u8[16]> tbl(T1 a, T2 b)
4036+
{
4037+
value_t<u8[16]> result;
4038+
const auto data0 = a.eval(m_ir);
4039+
const auto index = b.eval(m_ir);
4040+
const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
4041+
4042+
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
4043+
{
4044+
v128 mask{};
4045+
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
4046+
4047+
if (cv)
4048+
{
4049+
for (u32 i = 0; i < 16; i++)
4050+
{
4051+
const u64 b_val = cv->getElementAsInteger(i);
4052+
mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
4053+
}
4054+
}
4055+
4056+
4057+
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
4058+
{
4059+
result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
4060+
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
4061+
result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
4062+
return result;
4063+
}
4064+
}
4065+
4066+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
4067+
return result;
4068+
}
4069+
4070+
template <typename T1, typename T2, typename T3>
4071+
value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
4072+
{
4073+
value_t<u8[16]> result;
4074+
const auto data0 = a.eval(m_ir);
4075+
const auto data1 = b.eval(m_ir);
4076+
const auto index = indices.eval(m_ir);
4077+
4078+
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
4079+
{
4080+
v128 mask{};
4081+
v128 bitmask{};
4082+
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
4083+
4084+
if (cv)
4085+
{
4086+
for (u32 i = 0; i < 16; i++)
4087+
{
4088+
const u64 b_val = cv->getElementAsInteger(i);
4089+
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
4090+
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
4091+
}
4092+
}
4093+
4094+
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
4095+
{
4096+
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
4097+
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
4098+
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
4099+
4100+
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
4101+
result.value = m_ir->CreateAnd(lookup, z_mask);
4102+
return result;
4103+
}
4104+
}
4105+
4106+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
4107+
return result;
4108+
}
4109+
4110+
template <typename T1, typename T2, typename T3>
4111+
value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
4112+
{
4113+
value_t<u8[16]> result;
4114+
const auto v_fallback = fallback.eval(m_ir);
4115+
const auto data0 = a.eval(m_ir);
4116+
const auto index = indices.eval(m_ir);
4117+
4118+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
4119+
return result;
4120+
}
4121+
4122+
template <typename T1, typename T2, typename T3, typename T4>
4123+
value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
4124+
{
4125+
value_t<u8[16]> result;
4126+
const auto v_fallback = fallback.eval(m_ir);
4127+
const auto data0 = a.eval(m_ir);
4128+
const auto data1 = b.eval(m_ir);
4129+
const auto index = indices.eval(m_ir);
4130+
4131+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
4132+
return result;
4133+
}
4134+
#endif
4135+
40334136
// (m << 3) >= 0 ? a : b
40344137
template <typename T, typename U, typename V>
40354138
static auto select_by_bit4(T&& m, U&& a, V&& b)

rpcs3/Emu/Cell/PPUTranslator.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,6 +1642,18 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
16421642
{
16431643
const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
16441644

1645+
#ifdef ARCH_ARM64
1646+
1647+
if (op.ra == op.rb)
1648+
{
1649+
set_vr(op.vd, tbl(a, (~c & 0xf)));
1650+
return;
1651+
}
1652+
1653+
set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
1654+
}
1655+
#else
1656+
16451657
if (op.ra == op.rb)
16461658
{
16471659
set_vr(op.vd, pshufb(a, ~c & 0xf));
@@ -1658,6 +1670,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
16581670
const auto i = eval(~c & 0x1f);
16591671
set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
16601672
}
1673+
#endif
16611674

16621675
void PPUTranslator::VPKPX(ppu_opcode_t op)
16631676
{

rpcs3/Emu/Cell/SPULLVMRecompiler.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6745,6 +6745,77 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
67456745
const auto a = get_vr<u8[16]>(op.ra);
67466746
const auto b = get_vr<u8[16]>(op.rb);
67476747

6748+
#ifdef ARCH_ARM64
6749+
6750+
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
6751+
{
6752+
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
6753+
{
6754+
if (op.ra == op.rb)
6755+
{
6756+
if (perm_only)
6757+
{
6758+
const auto cm = eval(c & 0x0f);
6759+
set_vr(op.rt4, tbl(as, cm));
6760+
return;
6761+
}
6762+
6763+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
6764+
const auto cm = eval(c & 0x8f);
6765+
set_vr(op.rt4, tbx(x, as, cm));
6766+
return;
6767+
}
6768+
6769+
if (perm_only)
6770+
{
6771+
const auto cm = eval(c & 0x1f);
6772+
set_vr(op.rt4, tbl2(as, bs, cm));
6773+
return;
6774+
}
6775+
6776+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
6777+
const auto cm = eval(c & 0x9f);
6778+
set_vr(op.rt4, tbx2(x, as, bs, cm));
6779+
return;
6780+
}
6781+
6782+
}
6783+
6784+
6785+
if (op.ra == op.rb && !m_interp_magn)
6786+
{
6787+
if (perm_only)
6788+
{
6789+
const auto cm = eval(c & 0x0f);
6790+
const auto cr = eval(cm ^ 0x0f);
6791+
set_vr(op.rt4, tbl(a, cr));
6792+
return;
6793+
}
6794+
6795+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
6796+
const auto cm = eval(c & 0x8f);
6797+
const auto cr = eval(cm ^ 0x0f);
6798+
set_vr(op.rt4, tbx(x, a, cr));
6799+
return;
6800+
}
6801+
6802+
if (perm_only)
6803+
{
6804+
const auto cm = eval(c & 0x9f);
6805+
const auto cr = eval(cm ^ 0x0f);
6806+
set_vr(op.rt4, tbl2(a, b, cr));
6807+
return;
6808+
}
6809+
6810+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
6811+
// AND should be before XOR so that llvm can combine them into BCAX
6812+
// Though for some reason it doesn't seem to be doing that.
6813+
const auto cm = eval(c & ~0x60);
6814+
const auto cr = eval(cm ^ 0x0f);
6815+
set_vr(op.rt4, tbx2(x, a, b, cr));
6816+
}
6817+
#else
6818+
67486819
// Data with swapped endian from a load instruction
67496820
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
67506821
{
@@ -6889,6 +6960,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
68896960
else
68906961
set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
68916962
}
6963+
#endif
68926964

68936965
void MPYA(spu_opcode_t op)
68946966
{

0 commit comments

Comments
 (0)