Skip to content

Commit 7e54a0b

Browse files
committed
PPU/SPU LLVM: Use arm shuffles in recompilers instead of emulating x86 pshufb
- SHUFB goes from 9 instructions down to 5.
- It should be 4, if LLVM would just emit BCAX...
1 parent 0e2584f commit 7e54a0b

3 files changed

Lines changed: 188 additions & 0 deletions

File tree

rpcs3/Emu/CPU/CPUTranslator.h

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3936,6 +3936,109 @@ class cpu_translator
39363936
});
39373937
}
39383938

3939+
#ifdef ARCH_ARM64
3940+
template <typename T1, typename T2>
3941+
value_t<u8[16]> tbl(T1 a, T2 b)
3942+
{
3943+
value_t<u8[16]> result;
3944+
const auto data0 = a.eval(m_ir);
3945+
const auto index = b.eval(m_ir);
3946+
const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
3947+
3948+
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
3949+
{
3950+
v128 mask{};
3951+
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
3952+
3953+
if (cv)
3954+
{
3955+
for (u32 i = 0; i < 16; i++)
3956+
{
3957+
const u64 b_val = cv->getElementAsInteger(i);
3958+
mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
3959+
}
3960+
}
3961+
3962+
3963+
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
3964+
{
3965+
result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
3966+
result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
3967+
result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
3968+
return result;
3969+
}
3970+
}
3971+
3972+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
3973+
return result;
3974+
}
3975+
3976+
template <typename T1, typename T2, typename T3>
3977+
value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
3978+
{
3979+
value_t<u8[16]> result;
3980+
const auto data0 = a.eval(m_ir);
3981+
const auto data1 = b.eval(m_ir);
3982+
const auto index = indices.eval(m_ir);
3983+
3984+
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
3985+
{
3986+
v128 mask{};
3987+
v128 bitmask{};
3988+
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
3989+
3990+
if (cv)
3991+
{
3992+
for (u32 i = 0; i < 16; i++)
3993+
{
3994+
const u64 b_val = cv->getElementAsInteger(i);
3995+
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
3996+
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
3997+
}
3998+
}
3999+
4000+
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
4001+
{
4002+
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
4003+
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
4004+
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
4005+
4006+
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
4007+
result.value = m_ir->CreateAnd(lookup, z_mask);
4008+
return result;
4009+
}
4010+
}
4011+
4012+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
4013+
return result;
4014+
}
4015+
4016+
template <typename T1, typename T2, typename T3>
4017+
value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
4018+
{
4019+
value_t<u8[16]> result;
4020+
const auto v_fallback = fallback.eval(m_ir);
4021+
const auto data0 = a.eval(m_ir);
4022+
const auto index = indices.eval(m_ir);
4023+
4024+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
4025+
return result;
4026+
}
4027+
4028+
template <typename T1, typename T2, typename T3, typename T4>
4029+
value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
4030+
{
4031+
value_t<u8[16]> result;
4032+
const auto v_fallback = fallback.eval(m_ir);
4033+
const auto data0 = a.eval(m_ir);
4034+
const auto data1 = b.eval(m_ir);
4035+
const auto index = indices.eval(m_ir);
4036+
4037+
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
4038+
return result;
4039+
}
4040+
#endif
4041+
39394042
// (m << 3) >= 0 ? a : b
39404043
template <typename T, typename U, typename V>
39414044
static auto select_by_bit4(T&& m, U&& a, V&& b)

rpcs3/Emu/Cell/PPUTranslator.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1641,6 +1641,18 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
16411641
{
16421642
const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
16431643

1644+
#ifdef ARCH_ARM64
1645+
1646+
if (op.ra == op.rb)
1647+
{
1648+
set_vr(op.vd, tbl(a, (~c & 0xf)));
1649+
return;
1650+
}
1651+
1652+
set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
1653+
}
1654+
#else
1655+
16441656
if (op.ra == op.rb)
16451657
{
16461658
set_vr(op.vd, pshufb(a, ~c & 0xf));
@@ -1657,6 +1669,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
16571669
const auto i = eval(~c & 0x1f);
16581670
set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
16591671
}
1672+
#endif
16601673

16611674
void PPUTranslator::VPKPX(ppu_opcode_t op)
16621675
{

rpcs3/Emu/Cell/SPULLVMRecompiler.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5918,6 +5918,77 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
59185918
const auto a = get_vr<u8[16]>(op.ra);
59195919
const auto b = get_vr<u8[16]>(op.rb);
59205920

5921+
#ifdef ARCH_ARM64
5922+
5923+
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
5924+
{
5925+
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
5926+
{
5927+
if (op.ra == op.rb)
5928+
{
5929+
if (perm_only)
5930+
{
5931+
const auto cm = eval(c & 0x0f);
5932+
set_vr(op.rt4, tbl(as, cm));
5933+
return;
5934+
}
5935+
5936+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5937+
const auto cm = eval(c & 0x8f);
5938+
set_vr(op.rt4, tbx(x, as, cm));
5939+
return;
5940+
}
5941+
5942+
if (perm_only)
5943+
{
5944+
const auto cm = eval(c & 0x1f);
5945+
set_vr(op.rt4, tbl2(as, bs, cm));
5946+
return;
5947+
}
5948+
5949+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5950+
const auto cm = eval(c & 0x9f);
5951+
set_vr(op.rt4, tbx2(x, as, bs, cm));
5952+
return;
5953+
}
5954+
5955+
}
5956+
5957+
5958+
if (op.ra == op.rb && !m_interp_magn)
5959+
{
5960+
if (perm_only)
5961+
{
5962+
const auto cm = eval(c & 0x0f);
5963+
const auto cr = eval(cm ^ 0x0f);
5964+
set_vr(op.rt4, tbl(a, cr));
5965+
return;
5966+
}
5967+
5968+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5969+
const auto cm = eval(c & 0x8f);
5970+
const auto cr = eval(cm ^ 0x0f);
5971+
set_vr(op.rt4, tbx(x, a, cr));
5972+
return;
5973+
}
5974+
5975+
if (perm_only)
5976+
{
5977+
const auto cm = eval(c & 0x9f);
5978+
const auto cr = eval(cm ^ 0x0f);
5979+
set_vr(op.rt4, tbl2(a, b, cr));
5980+
return;
5981+
}
5982+
5983+
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
5984+
// AND should be before XOR so that llvm can combine them into BCAX
5985+
// Though for some reason it doesn't seem to be doing that.
5986+
const auto cm = eval(c & ~0x60);
5987+
const auto cr = eval(cm ^ 0x0f);
5988+
set_vr(op.rt4, tbx2(x, a, b, cr));
5989+
}
5990+
#else
5991+
59215992
// Data with swapped endian from a load instruction
59225993
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
59235994
{
@@ -6062,6 +6133,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
60626133
else
60636134
set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
60646135
}
6136+
#endif
60656137

60666138
void MPYA(spu_opcode_t op)
60676139
{

0 commit comments

Comments
 (0)