From 3109e0176ae8b40d16ab12a72d9a50d5bd17f100 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 2 Apr 2026 21:26:11 -0700 Subject: [PATCH] ARM64EC: Optimize GPR and MM state setting I think the code improvement speaks for itself here. --- Source/Windows/ARM64EC/Module.cpp | 69 ++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/Source/Windows/ARM64EC/Module.cpp b/Source/Windows/ARM64EC/Module.cpp index 28c25cfc72..c761eedcf7 100644 --- a/Source/Windows/ARM64EC/Module.cpp +++ b/Source/Windows/ARM64EC/Module.cpp @@ -46,6 +46,7 @@ desc: Implements the ARM64EC BT module API using FEXCore #include "BTInterface.h" #include "Windows/Common/SHMStats.h" +#include #include #include #include @@ -311,32 +312,46 @@ static bool HandleUnalignedAccess(const ThreadCPUArea CPUArea, ARM64_NT_CONTEXT& return Result.has_value(); } -static void LoadStateFromECContext(FEXCore::Core::InternalThreadState* Thread, CONTEXT& Context) { +static void LoadStateFromECContext(FEXCore::Core::InternalThreadState* Thread, const CONTEXT& Context) { auto& State = Thread->CurrentFrame->State; if ((Context.ContextFlags & CONTEXT_INTEGER) == CONTEXT_INTEGER) { + // Ensure ordering. + static_assert(((offsetof(CONTEXT, Rax) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RAX); + static_assert(((offsetof(CONTEXT, Rcx) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RCX); + static_assert(((offsetof(CONTEXT, Rdx) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RDX); + static_assert(((offsetof(CONTEXT, Rbx) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RBX); + static_assert(((offsetof(CONTEXT, Rsp) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RSP); + static_assert(((offsetof(CONTEXT, Rbp) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RBP); + static_assert(((offsetof(CONTEXT, Rsi) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RSI); + static_assert(((offsetof(CONTEXT, Rdi) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RDI); + static_assert(((offsetof(CONTEXT, R8) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R8); + static_assert(((offsetof(CONTEXT, R9) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R9); + static_assert(((offsetof(CONTEXT, R10) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R10); + static_assert(((offsetof(CONTEXT, R11) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R11); + static_assert(((offsetof(CONTEXT, R12) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R12); + static_assert(((offsetof(CONTEXT, R13) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R13); + static_assert(((offsetof(CONTEXT, R14) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R14); + static_assert(((offsetof(CONTEXT, R15) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_R15); + // General register state - State.gregs[FEXCore::X86State::REG_RAX] = Context.Rax; - State.gregs[FEXCore::X86State::REG_RCX] = Context.Rcx; - State.gregs[FEXCore::X86State::REG_RDX] = Context.Rdx; - State.gregs[FEXCore::X86State::REG_RBX] = Context.Rbx; - - State.gregs[FEXCore::X86State::REG_RSI] = Context.Rsi; - State.gregs[FEXCore::X86State::REG_RDI] = Context.Rdi; - State.gregs[FEXCore::X86State::REG_R8] = Context.R8; - State.gregs[FEXCore::X86State::REG_R9] = Context.R9; - State.gregs[FEXCore::X86State::REG_R10] = Context.R10; - State.gregs[FEXCore::X86State::REG_R11] = Context.R11; - State.gregs[FEXCore::X86State::REG_R12] = Context.R12; - State.gregs[FEXCore::X86State::REG_R13] = Context.R13; - State.gregs[FEXCore::X86State::REG_R14] = Context.R14; - State.gregs[FEXCore::X86State::REG_R15] = Context.R15; + auto Src = reinterpret_cast(&Context.Rax); + auto Dst = reinterpret_cast(&State.gregs[FEXCore::X86State::REG_RAX]); + + asm volatile(R"( + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Src]], #64; + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Dst]], #64; + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Src]], #64; + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Dst]], #64; + )" + : [Src] "+r"(Src), [Dst] "+r"(Dst)::"memory", "v0", "v1", "v2", "v3"); } if ((Context.ContextFlags & CONTEXT_CONTROL) == CONTEXT_CONTROL) { State.rip = Context.Rip; - State.gregs[FEXCore::X86State::REG_RSP] = Context.Rsp; - State.gregs[FEXCore::X86State::REG_RBP] = Context.Rbp; + static_assert(((offsetof(CONTEXT, Rsp) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RSP); + static_assert(((offsetof(CONTEXT, Rbp) - offsetof(CONTEXT, Rax)) / sizeof(uint64_t)) == FEXCore::X86State::REG_RBP); + memcpy(&State.gregs[FEXCore::X86State::REG_RSP], &Context.Rsp, sizeof(uint64_t) * 2); CTX->SetFlagsFromCompactedEFLAGS(Thread, Context.EFlags); } @@ -364,13 +379,27 @@ static void LoadStateFromECContext(FEXCore::Core::InternalThreadState* Thread, C if ((Context.ContextFlags & CONTEXT_FLOATING_POINT) == CONTEXT_FLOATING_POINT) { // Floating-point register state if ((Context.ContextFlags & CONTEXT_XSTATE) == CONTEXT_XSTATE) { - const auto* Ymm = RtlLocateExtendedFeature(reinterpret_cast(&Context + 1), XSTATE_AVX, nullptr); + auto Ymm = RtlLocateExtendedFeature(const_cast(reinterpret_cast(&Context + 1)), XSTATE_AVX, nullptr); CTX->SetXMMRegistersFromState(Thread, reinterpret_cast(Context.FltSave.XmmRegisters), reinterpret_cast(Ymm)); } else { CTX->SetXMMRegistersFromState(Thread, reinterpret_cast(Context.FltSave.XmmRegisters), nullptr); } - memcpy(State.mm, Context.FltSave.FloatRegisters, sizeof(State.mm)); + + // Sanity check to make sure padding is correct. + static_assert(sizeof(State.mm[0]) == 16); + + // X87 registers + auto Src = reinterpret_cast(Context.FltSave.FloatRegisters); + auto Dst = reinterpret_cast(State.mm); + + asm volatile(R"( + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Src]], #64; + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Dst]], #64; + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Src]], #64; + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[Dst]], #64; + )" + : [Src] "+r"(Src), [Dst] "+r"(Dst)::"memory", "v0", "v1", "v2", "v3"); State.FCW = Context.FltSave.ControlWord; State.flags[FEXCore::X86State::X87FLAG_IE_LOC] = Context.FltSave.StatusWord & 1;