diff --git a/src/core/gte-instructions.cc b/src/core/gte-instructions.cc
new file mode 100644
index 000000000..c90515371
--- /dev/null
+++ b/src/core/gte-instructions.cc
@@ -0,0 +1,385 @@
+/***************************************************************************
+ *   Copyright (C) 2026 PCSX-Redux authors                                *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.           *
+ ***************************************************************************/
+
+// GTE instruction implementations and public dispatch methods.
+//
+// Each instruction is implemented as a template parameterized on sf (shift
+// factor) and lm (limit mode). The public methods decode these bits from
+// the instruction encoding and dispatch to the right instantiation.
+//
+// MVMVA is further templatized on mx, v, and cv for full compile-time
+// elimination of the matrix/vector selection branches.
+
+#include "core/gte.h"
+#include "core/gte-internal.h"
+#include "core/pgxp_debug.h"
+#include "core/pgxp_gte.h"
+
+using namespace PCSX::GTEImpl;
+
+// ============================================================================
+// Template instruction implementations
+// ============================================================================
+
+// RTPS core: perspective transform for vertex v.
+//
+// sf enables the >>12 applied when MAC1-3 are stored, lm raises the lower IR
+// clamp from -0x8000 to 0, and v selects the input vertex.
+// When last=true, computes the depth queue interpolation at the end.
+template <bool sf, bool lm, int v>
+static void rtps(bool last) {
+    // MAC = (TR << 12) + R * V, accumulated through int44 so overflow is tracked per addition.
+    mac1() = A1<sf>(int44(trX() << 12) +
+                    r11() * vertexX<v>() + r12() * vertexY<v>() + r13() * vertexZ<v>());
+    mac2() = A2<sf>(int44(trY() << 12) +
+                    r21() * vertexX<v>() + r22() * vertexY<v>() + r23() * vertexZ<v>());
+    // The unshifted MAC3 is kept because IR3 and SZ3 are derived from it below.
+    int64_t rawMac3;
+    mac3() = A3<sf>(int44(trZ() << 12) +
+                    r31() * vertexX<v>() + r32() * vertexY<v>() + r33() * vertexZ<v>(), rawMac3);
+
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3sf<sf, lm>(rawMac3);
+
+    // SZ3 is always taken from the >>12 value, independent of sf.
+    pushZ(limD<true>(rawMac3));
+
+    int32_t hOverSz3 = gteDivide(gteH(), sz3());
+
+    // Advance the screen XY FIFO; slot 2 is filled in below.
+    sxy0() = sxy1();
+    sxy1() = sxy2();
+
+    // Only the X term is scaled (by 0.75) when the Widescreen option is enabled.
+    double widescreenFactor = PCSX::g_emulator->config().Widescreen ? 0.75 : 1.0;
+    // ir1()*hOverSz3 can exceed int32_t (hOverSz3 is up to 0x1FFFF), so widen ir first
+    sx2() = limG1(F(gteOFX() + (int64_t)ir1() * hOverSz3 * widescreenFactor) >> 16);
+    sy2() = limG2(F(gteOFY() + (int64_t)ir2() * hOverSz3) >> 16);
+
+    // PGXP receives the same sums before the >>16, clamped without touching FLAG,
+    // together with the freshly written SXY2.
+    PGXP_pushSXYZ2s(limG1ia(gteOFX() + (int64_t)ir1() * hOverSz3 * widescreenFactor),
+                     limG2ia(gteOFY() + (int64_t)ir2() * hOverSz3),
+                     std::max((int)sz3(), gteH() / 2), sxy2());
+
+    if (last) {
+        // Depth queue: MAC0 = DQB + DQA * (H / SZ3); IR0 is the >>12 value limited by limH.
+        int64_t rawMac0;
+        mac0() = F(gteDQB() + gteDQA() * hOverSz3, rawMac0);
+        ir0() = limH(rawMac0);
+    }
+}
+
+// OP: outer product using rotation matrix diagonal.
+// The diagonal (R11, R22, R33) is crossed with (IR1, IR2, IR3); the products go
+// to MAC1-3 (shifted per sf) and are then limited into IR1-3 (per lm).
+template <bool sf, bool lm>
+void PCSX::GTE::op(uint32_t op) {
+    gteFlag() = 0;
+
+    // Snapshot every operand first: IR1-3 are only overwritten at the very end.
+    const int32_t d1 = r11();
+    const int32_t d2 = r22();
+    const int32_t d3 = r33();
+    const int32_t x = ir1();
+    const int32_t y = ir2();
+    const int32_t z = ir3();
+
+    mac1() = A1<sf>(d2 * z - d3 * y);
+    mac2() = A2<sf>(d3 * x - d1 * z);
+    mac3() = A3<sf>(d1 * y - d2 * x);
+
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3<lm>(mac3());
+}
+
+// DPCS: runs the depthCue stage on the RGBC channels and pushes the result
+// into the color FIFO.
+template <bool sf, bool lm>
+void PCSX::GTE::dpcs(uint32_t op) {
+    gteFlag() = 0;
+    // Each 8-bit channel is promoted to int and scaled by 1 << 16 before the stage runs.
+    const int64_t red = rgbR() << 16;
+    const int64_t green = rgbG() << 16;
+    const int64_t blue = rgbB() << 16;
+    depthCue<sf, lm>(red, green, blue);
+    pushColor();
+}
+
+// INTPL: runs the depthCue stage on the IR1-3 vector (scaled by 1 << 12) and
+// pushes the result into the color FIFO.
+template <bool sf, bool lm>
+void PCSX::GTE::intpl(uint32_t op) {
+    gteFlag() = 0;
+    const int64_t inX = ir1() << 12;
+    const int64_t inY = ir2() << 12;
+    const int64_t inZ = ir3() << 12;
+    depthCue<sf, lm>(inX, inY, inZ);
+    pushColor();
+}
+
+// MVMVA: fully templatized wrapper for dispatch table.
+// Clears FLAG, then runs the matrix-vector multiply for the selected
+// (sf, lm, mx, v, cv) combination.
+template <bool sf, bool lm, int mx, int v, int cv>
+static void mvmvaImpl() {
+    gteFlag() = 0;
+    matrixVectorMultiply<sf, lm, mx, v, cv>();
+}
+
+// NCDS core: used by NCDS (v=0) and NCDT (v=0,1,2).
+// Stages run in order: light transform of vertex v, color matrix, depth-cue of
+// the RGBC-weighted result, then a push into the color FIFO.
+// FLAG is not cleared here; the public entry points do that before calling.
+template <bool sf, bool lm, int v>
+static void ncdsCore() {
+    lightTransform<sf, lm, v>();
+    colorMatrix<sf, lm>();
+    depthCueColor<sf, lm>();
+    pushColor();
+}
+
+// CDP: same stages as ncdsCore minus the light transform; starts from the
+// current IR1-3 and clears FLAG itself.
+template <bool sf, bool lm>
+void PCSX::GTE::cdp(uint32_t op) {
+    gteFlag() = 0;
+    colorMatrix<sf, lm>();
+    depthCueColor<sf, lm>();
+    pushColor();
+}
+
+// NCCS core: used by NCCS (v=0) and NCCT (v=0,1,2).
+// Stages run in order: light transform of vertex v, color matrix, RGBC color
+// apply, then a push into the color FIFO.
+// FLAG is not cleared here; the public entry points do that before calling.
+template <bool sf, bool lm, int v>
+static void nccsCore() {
+    lightTransform<sf, lm, v>();
+    colorMatrix<sf, lm>();
+    colorApply<sf, lm>();
+    pushColor();
+}
+
+// CC: same stages as nccsCore minus the light transform; starts from the
+// current IR1-3 and clears FLAG itself.
+template <bool sf, bool lm>
+void PCSX::GTE::cc(uint32_t op) {
+    gteFlag() = 0;
+    colorMatrix<sf, lm>();
+    colorApply<sf, lm>();
+    pushColor();
+}
+
+// NCS core: used by NCS (v=0) and NCT (v=0,1,2).
+// Stages run in order: light transform of vertex v, color matrix, then a push
+// into the color FIFO.
+// FLAG is not cleared here; the public entry points do that before calling.
+template <bool sf, bool lm, int v>
+static void ncsCore() {
+    lightTransform<sf, lm, v>();
+    colorMatrix<sf, lm>();
+    pushColor();
+}
+
+// SQR: squares each component of IR1-3 into MAC1-3 (shifted per sf), then
+// limits the results back into IR1-3 (per lm).
+template <bool sf, bool lm>
+void PCSX::GTE::sqr(uint32_t op) {
+    gteFlag() = 0;
+
+    // Widen once so each product is computed in 32 bits.
+    const int32_t x = ir1();
+    const int32_t y = ir2();
+    const int32_t z = ir3();
+
+    mac1() = A1<sf>(x * x);
+    mac2() = A2<sf>(y * y);
+    mac3() = A3<sf>(z * z);
+
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3<lm>(mac3());
+}
+
+// DCPL: depth-cues the RGBC * IR product (see depthCueColor) and pushes the
+// result into the color FIFO.
+template <bool sf, bool lm>
+void PCSX::GTE::dcpl(uint32_t op) {
+    gteFlag() = 0;
+    depthCueColor<sf, lm>();
+    pushColor();
+}
+
+// DPCT: applies the depthCue stage to the RGB0 FIFO slot three times in a row.
+// Every pushColor() advances the FIFO, so each pass reads a different entry.
+template <bool sf, bool lm>
+void PCSX::GTE::dpct(uint32_t op) {
+    gteFlag() = 0;
+
+    const auto cueOldestEntry = [] {
+        depthCue<sf, lm>(rgb0R() << 16, rgb0G() << 16, rgb0B() << 16);
+        pushColor();
+    };
+
+    cueOldestEntry();
+    cueOldestEntry();
+    cueOldestEntry();
+}
+
+// GPF: scales IR1-3 by IR0 into MAC1-3 (shifted per sf), limits the results
+// into IR1-3 (per lm) and pushes them into the color FIFO.
+template <bool sf, bool lm>
+void PCSX::GTE::gpf(uint32_t op) {
+    gteFlag() = 0;
+
+    const int32_t scale = ir0();
+    mac1() = A1<sf>(scale * ir1());
+    mac2() = A2<sf>(scale * ir2());
+    mac3() = A3<sf>(scale * ir3());
+
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3<lm>(mac3());
+
+    pushColor();
+}
+
+// GPL: like GPF, but the IR0 * IR products are added onto the previous MAC1-3.
+// With sf set the old MAC values are first scaled back up by 12 bits so they
+// line up with the unshifted products.
+template <bool sf, bool lm>
+void PCSX::GTE::gpl(uint32_t op) {
+    gteFlag() = 0;
+
+    // Widen before shifting: <<12 on int32_t overflows.
+    constexpr int accShift = sf ? 12 : 0;
+    const int64_t base1 = static_cast<int64_t>(mac1()) << accShift;
+    const int64_t base2 = static_cast<int64_t>(mac2()) << accShift;
+    const int64_t base3 = static_cast<int64_t>(mac3()) << accShift;
+
+    mac1() = A1<sf>(base1 + ir0() * ir1());
+    mac2() = A2<sf>(base2 + ir0() * ir2());
+    mac3() = A3<sf>(base3 + ir0() * ir3());
+
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3<lm>(mac3());
+
+    pushColor();
+}
+
+// ============================================================================
+// MVMVA dispatch table (256 entries: sf * lm * mx * v * cv)
+// ============================================================================
+
+namespace {
+
+// Adapter handed to makeMvmvaTable: exposes each mvmvaImpl instantiation as a
+// static function so its address can be stored in the table.
+template <bool sf, bool lm, int mx, int v, int cv>
+struct MvmvaEntry {
+    static void fn() { mvmvaImpl<sf, lm, mx, v, cv>(); }
+};
+
+// Signature shared by every table entry.
+using MvmvaFn = void (*)();
+
+// Built at compile time; indexed by the [sf][lm][mx][v][cv] bit layout that
+// PCSX::GTE::MVMVA assembles from the instruction word.
+constexpr auto mvmvaTable =
+    PCSX::GTEImpl::makeMvmvaTable<MvmvaFn, MvmvaEntry>(std::make_index_sequence<256>{});
+
+}  // anonymous namespace
+
+// ============================================================================
+// Public dispatch methods
+// ============================================================================
+
+// Expands to a switch that calls method<sf, lm>(op, extra args...) for the
+// sf/lm combination encoded in `code`. Expects a variable named `code` in the
+// enclosing scope; the instruction word is masked to its low 25 bits first.
+// The `##__VA_ARGS__` comma elision is a compiler extension.
+#define GTE_DISPATCH_SF_LM(method, ...)                                    \
+    do {                                                                    \
+        uint32_t _op = code & 0x1ffffff;                                    \
+        switch (sfLmIndex(_op)) {                                           \
+            case 0: method<false, false>(_op, ##__VA_ARGS__); break;        \
+            case 1: method<false, true>(_op, ##__VA_ARGS__); break;         \
+            case 2: method<true, false>(_op, ##__VA_ARGS__); break;         \
+            case 3: method<true, true>(_op, ##__VA_ARGS__); break;          \
+        }                                                                   \
+    } while (0)
+
+// RTPS: single-vertex perspective transform (vertex 0), including the depth
+// queue step. Picks the rtps instantiation matching the sf/lm bits of `code`.
+void PCSX::GTE::RTPS(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            rtps<false, false, 0>(true);
+            break;
+        case 1:
+            rtps<false, true, 0>(true);
+            break;
+        case 2:
+            rtps<true, false, 0>(true);
+            break;
+        case 3:
+            rtps<true, true, 0>(true);
+            break;
+    }
+}
+
+// RTPT: perspective transform of vertices 0, 1 and 2 in order. Only the final
+// vertex runs the depth queue step (last=true).
+void PCSX::GTE::RTPT(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            rtps<false, false, 0>(false);
+            rtps<false, false, 1>(false);
+            rtps<false, false, 2>(true);
+            break;
+        case 1:
+            rtps<false, true, 0>(false);
+            rtps<false, true, 1>(false);
+            rtps<false, true, 2>(true);
+            break;
+        case 2:
+            rtps<true, false, 0>(false);
+            rtps<true, false, 1>(false);
+            rtps<true, false, 2>(true);
+            break;
+        case 3:
+            rtps<true, true, 0>(false);
+            rtps<true, true, 1>(false);
+            rtps<true, true, 2>(true);
+            break;
+    }
+}
+
+// NCLIP: writes the signed-area style sum of the three screen XY FIFO entries
+// to MAC0. When PGXP reports valid data for the three entries, its own result
+// is used instead of the integer computation.
+void PCSX::GTE::NCLIP(uint32_t code) {
+    gteFlag() = 0;
+    if (PGXP_NLCIP_valid(sxy0(), sxy1(), sxy2())) {
+        mac0() = F(PGXP_NCLIP());
+        return;
+    }
+
+    // Widen all six coordinates so the whole sum is evaluated in 64 bits.
+    const int64_t x0 = sx0(), y0 = sy0();
+    const int64_t x1 = sx1(), y1 = sy1();
+    const int64_t x2 = sx2(), y2 = sy2();
+    mac0() = F(x0 * y1 + x1 * y2 + x2 * y0 - x0 * y2 - x1 * y0 - x2 * y1);
+}
+
+// Public entry points that only select the <sf, lm> instantiation from `code`;
+// the templates themselves clear FLAG.
+void PCSX::GTE::OP(uint32_t code) { GTE_DISPATCH_SF_LM(op); }
+void PCSX::GTE::DPCS(uint32_t code) { GTE_DISPATCH_SF_LM(dpcs); }
+void PCSX::GTE::INTPL(uint32_t code) { GTE_DISPATCH_SF_LM(intpl); }
+
+// MVMVA: decodes sf (bit 19), lm (bit 10), mx (bits 18-17), v (bits 16-15) and
+// cv (bits 14-13) from the instruction word and jumps through mvmvaTable.
+void PCSX::GTE::MVMVA(uint32_t code) {
+    const uint32_t bits = code & 0x1ffffff;
+    const auto field = [bits](unsigned shift, unsigned mask) -> unsigned { return (bits >> shift) & mask; };
+
+    // Table index layout: [sf:1][lm:1][mx:2][v:2][cv:2].
+    const unsigned slot = (field(19, 1) << 7) | (field(10, 1) << 6) | (field(17, 3) << 4) |
+                          (field(15, 3) << 2) | field(13, 3);
+    mvmvaTable[slot]();
+}
+
+// NCDS: runs ncdsCore on vertex 0 with the sf/lm combination encoded in `code`.
+void PCSX::GTE::NCDS(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            ncdsCore<false, false, 0>();
+            break;
+        case 1:
+            ncdsCore<false, true, 0>();
+            break;
+        case 2:
+            ncdsCore<true, false, 0>();
+            break;
+        case 3:
+            ncdsCore<true, true, 0>();
+            break;
+    }
+}
+
+// Public entry point: selects the cdp<sf, lm> instantiation from `code`.
+void PCSX::GTE::CDP(uint32_t code) { GTE_DISPATCH_SF_LM(cdp); }
+
+// NCDT: runs ncdsCore on vertices 0, 1 and 2 in order; FLAG is cleared once up
+// front so it accumulates across all three passes.
+void PCSX::GTE::NCDT(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            ncdsCore<false, false, 0>();
+            ncdsCore<false, false, 1>();
+            ncdsCore<false, false, 2>();
+            break;
+        case 1:
+            ncdsCore<false, true, 0>();
+            ncdsCore<false, true, 1>();
+            ncdsCore<false, true, 2>();
+            break;
+        case 2:
+            ncdsCore<true, false, 0>();
+            ncdsCore<true, false, 1>();
+            ncdsCore<true, false, 2>();
+            break;
+        case 3:
+            ncdsCore<true, true, 0>();
+            ncdsCore<true, true, 1>();
+            ncdsCore<true, true, 2>();
+            break;
+    }
+}
+
+// NCCS: runs nccsCore on vertex 0 with the sf/lm combination encoded in `code`.
+void PCSX::GTE::NCCS(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            nccsCore<false, false, 0>();
+            break;
+        case 1:
+            nccsCore<false, true, 0>();
+            break;
+        case 2:
+            nccsCore<true, false, 0>();
+            break;
+        case 3:
+            nccsCore<true, true, 0>();
+            break;
+    }
+}
+
+// Public entry point: selects the cc<sf, lm> instantiation from `code`.
+void PCSX::GTE::CC(uint32_t code) { GTE_DISPATCH_SF_LM(cc); }
+
+// NCS: runs ncsCore on vertex 0 with the sf/lm combination encoded in `code`.
+void PCSX::GTE::NCS(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            ncsCore<false, false, 0>();
+            break;
+        case 1:
+            ncsCore<false, true, 0>();
+            break;
+        case 2:
+            ncsCore<true, false, 0>();
+            break;
+        case 3:
+            ncsCore<true, true, 0>();
+            break;
+    }
+}
+
+// NCT: runs ncsCore on vertices 0, 1 and 2 in order; FLAG is cleared once up
+// front so it accumulates across all three passes.
+void PCSX::GTE::NCT(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            ncsCore<false, false, 0>();
+            ncsCore<false, false, 1>();
+            ncsCore<false, false, 2>();
+            break;
+        case 1:
+            ncsCore<false, true, 0>();
+            ncsCore<false, true, 1>();
+            ncsCore<false, true, 2>();
+            break;
+        case 2:
+            ncsCore<true, false, 0>();
+            ncsCore<true, false, 1>();
+            ncsCore<true, false, 2>();
+            break;
+        case 3:
+            ncsCore<true, true, 0>();
+            ncsCore<true, true, 1>();
+            ncsCore<true, true, 2>();
+            break;
+    }
+}
+
+// Public entry points that only select the <sf, lm> instantiation from `code`;
+// the templates themselves clear FLAG.
+void PCSX::GTE::SQR(uint32_t code) { GTE_DISPATCH_SF_LM(sqr); }
+void PCSX::GTE::DCPL(uint32_t code) { GTE_DISPATCH_SF_LM(dcpl); }
+void PCSX::GTE::DPCT(uint32_t code) { GTE_DISPATCH_SF_LM(dpct); }
+
+// AVSZ3: MAC0 = ZSF3 * (SZ1 + SZ2 + SZ3); OTZ is the >>12 value limited to
+// 0..0xffff.
+void PCSX::GTE::AVSZ3(uint32_t code) {
+    gteFlag() = 0;
+    const int64_t scale = gteZSF3();
+    // Sum in 64 bits; the factored form gives the same exact result as three products.
+    const int64_t depthSum = static_cast<int64_t>(sz1()) + sz2() + sz3();
+    int64_t rawMac0;
+    mac0() = F(scale * depthSum, rawMac0);
+    otz() = limD<true>(rawMac0);
+}
+
+// AVSZ4: MAC0 = ZSF4 * (SZ0 + SZ1 + SZ2 + SZ3); OTZ is the >>12 value limited
+// to 0..0xffff.
+void PCSX::GTE::AVSZ4(uint32_t code) {
+    gteFlag() = 0;
+    const int64_t scale = gteZSF4();
+    // Sum in 64 bits; the factored form gives the same exact result as four products.
+    const int64_t depthSum = static_cast<int64_t>(sz0()) + sz1() + sz2() + sz3();
+    int64_t rawMac0;
+    mac0() = F(scale * depthSum, rawMac0);
+    otz() = limD<true>(rawMac0);
+}
+
+// Public entry points that only select the <sf, lm> instantiation from `code`;
+// the templates themselves clear FLAG.
+void PCSX::GTE::GPF(uint32_t code) { GTE_DISPATCH_SF_LM(gpf); }
+void PCSX::GTE::GPL(uint32_t code) { GTE_DISPATCH_SF_LM(gpl); }
+
+// NCCT: runs nccsCore on vertices 0, 1 and 2 in order; FLAG is cleared once up
+// front so it accumulates across all three passes.
+void PCSX::GTE::NCCT(uint32_t code) {
+    const unsigned variant = sfLmIndex(code & 0x1ffffff);
+    gteFlag() = 0;
+    switch (variant) {
+        case 0:
+            nccsCore<false, false, 0>();
+            nccsCore<false, false, 1>();
+            nccsCore<false, false, 2>();
+            break;
+        case 1:
+            nccsCore<false, true, 0>();
+            nccsCore<false, true, 1>();
+            nccsCore<false, true, 2>();
+            break;
+        case 2:
+            nccsCore<true, false, 0>();
+            nccsCore<true, false, 1>();
+            nccsCore<true, false, 2>();
+            break;
+        case 3:
+            nccsCore<true, true, 0>();
+            nccsCore<true, true, 1>();
+            nccsCore<true, true, 2>();
+            break;
+    }
+}
+
+#undef GTE_DISPATCH_SF_LM
diff --git a/src/core/gte-internal.h b/src/core/gte-internal.h
new file mode 100644
index 000000000..71a27c407
--- /dev/null
+++ b/src/core/gte-internal.h
@@ -0,0 +1,480 @@
+/***************************************************************************
+ *   Copyright (C) 2026 PCSX-Redux authors                                *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.           *
+ ***************************************************************************/
+
+// GTE internal implementation header.
+//
+// Shared by gte-transfer.cc and gte-instructions.cc. Not part of the public
+// interface. Contains register accessors, arithmetic helpers, limiter functions,
+// and pipeline stage templates - everything that the GTE instruction
+// implementations need but callers of the GTE class do not.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+#include "core/gte.h"
+#include "core/psxemulator.h"
+#include "core/r3000a.h"
+#include "support/table-generator.h"
+
+namespace PCSX {
+namespace GTEImpl {
+
+// ============================================================================
+// 44-bit accumulator with per-addition overflow tracking
+// ============================================================================
+
+// Signed 44-bit accumulator stored in an int64_t. Overflow is sticky: once a
+// construction or addition leaves the 44-bit range, the matching flag stays
+// set on every value derived from it.
+class int44 {
+  public:
+    // Wraps `value` unchanged and flags it if it is already outside the 44-bit range.
+    int44(int64_t value)
+        : m_value(value), m_posOverflow(value > kMax), m_negOverflow(value < kMin) {}
+
+    // Builds a value with explicitly supplied overflow state.
+    int44(int64_t value, bool posOverflow, bool negOverflow)
+        : m_value(value), m_posOverflow(posOverflow), m_negOverflow(negOverflow) {}
+
+    // Adds `rhs`, sign-extends the sum from bit 43, and records an overflow when
+    // two same-signed operands produce a result of the opposite sign.
+    int44 operator+(int64_t rhs) const {
+        const int64_t wrapped = ((m_value + rhs) << 20) >> 20;
+        const bool bothNonNegative = m_value >= 0 && rhs >= 0;
+        const bool bothNegative = m_value < 0 && rhs < 0;
+        const bool overflowedUp = bothNonNegative && wrapped < 0;
+        const bool overflowedDown = bothNegative && wrapped >= 0;
+        return int44(wrapped, m_posOverflow || overflowedUp, m_negOverflow || overflowedDown);
+    }
+
+    bool positiveOverflow() const { return m_posOverflow; }
+    bool negativeOverflow() const { return m_negOverflow; }
+    int64_t value() const { return m_value; }
+
+  private:
+    static constexpr int64_t kMax = INT64_C(0x7ffffffffff);
+    static constexpr int64_t kMin = INT64_C(-0x80000000000);
+
+    int64_t m_value;
+    bool m_posOverflow;
+    bool m_negOverflow;
+};
+
+// ============================================================================
+// FLAG register bit definitions
+// ============================================================================
+
+// One constant per FLAG condition. Constants that OR in GTE_ERROR also raise
+// the summary bit (bit 31) whenever they are applied; IR3_SAT, the three color
+// saturation bits and IR0_SAT deliberately do not.
+namespace Flag {
+constexpr uint32_t GTE_ERROR = 1u << 31;
+constexpr uint32_t MAC1_POS = GTE_ERROR | (1u << 30);
+constexpr uint32_t MAC1_NEG = GTE_ERROR | (1u << 27);
+constexpr uint32_t MAC2_POS = GTE_ERROR | (1u << 29);
+constexpr uint32_t MAC2_NEG = GTE_ERROR | (1u << 26);
+constexpr uint32_t MAC3_POS = GTE_ERROR | (1u << 28);
+constexpr uint32_t MAC3_NEG = GTE_ERROR | (1u << 25);
+constexpr uint32_t IR1_SAT = GTE_ERROR | (1u << 24);
+constexpr uint32_t IR2_SAT = GTE_ERROR | (1u << 23);
+constexpr uint32_t IR3_SAT = 1u << 22;
+constexpr uint32_t COLOR_R_SAT = 1u << 21;
+constexpr uint32_t COLOR_G_SAT = 1u << 20;
+constexpr uint32_t COLOR_B_SAT = 1u << 19;
+constexpr uint32_t SZ_SAT = GTE_ERROR | (1u << 18);
+constexpr uint32_t DIV_OVER = GTE_ERROR | (1u << 17);
+constexpr uint32_t MAC0_POS = GTE_ERROR | (1u << 16);
+constexpr uint32_t MAC0_NEG = GTE_ERROR | (1u << 15);
+constexpr uint32_t SX_SAT = GTE_ERROR | (1u << 14);
+constexpr uint32_t SY_SAT = GTE_ERROR | (1u << 13);
+constexpr uint32_t IR0_SAT = 1u << 12;
+// Mask of bits 30-23 and 18-13, i.e. exactly the bits whose constants above include GTE_ERROR.
+constexpr uint32_t ERROR_BITS = 0x7f87e000u;
+}  // namespace Flag
+
+// ============================================================================
+// Register access
+// ============================================================================
+
+// Base pointers into the emulated CPU's COP2 data (CP2D) and control (CP2C)
+// register files. Every accessor below indexes through one of these.
+inline PAIR* dataRegs() { return g_emulator->m_cpu->m_regs.CP2D.p; }
+inline PAIR* ctrlRegs() { return g_emulator->m_cpu->m_regs.CP2C.p; }
+
+// Vertex vectors: compile-time v selection.
+// v = 0..2 reads the packed vertex in data registers (2v, 2v+1): X and Y share
+// register 2v (low/high half), Z is the low half of 2v+1.
+// Any other v reads the low halves of data registers 9, 10 and 11 (IR1-3).
+template <int v>
+inline int16_t vertexX() {
+    constexpr int reg = (v < 3) ? v * 2 : 9;
+    return dataRegs()[reg].sw.l;
+}
+template <int v>
+inline int16_t vertexY() {
+    if constexpr (v >= 3) {
+        return dataRegs()[10].sw.l;
+    } else {
+        return dataRegs()[v * 2].sw.h;
+    }
+}
+template <int v>
+inline int16_t vertexZ() {
+    constexpr int reg = (v < 3) ? v * 2 + 1 : 11;
+    return dataRegs()[reg].sw.l;
+}
+
+// Data register accessors. Each returns a mutable reference into the CP2D
+// register file, typed to the width and signedness the instructions use.
+
+// RGBC (data register 6): three color channels plus a code byte.
+inline uint8_t& rgbR() { return dataRegs()[6].b.l; }
+inline uint8_t& rgbG() { return dataRegs()[6].b.h; }
+inline uint8_t& rgbB() { return dataRegs()[6].b.h2; }
+inline uint8_t& rgbCode() { return dataRegs()[6].b.h3; }
+
+// OTZ (data register 7).
+inline uint16_t& otz() { return dataRegs()[7].w.l; }
+
+// IR0-IR3 (data registers 8-11), signed 16-bit.
+inline int16_t& ir0() { return dataRegs()[8].sw.l; }
+inline int16_t& ir1() { return dataRegs()[9].sw.l; }
+inline int16_t& ir2() { return dataRegs()[10].sw.l; }
+inline int16_t& ir3() { return dataRegs()[11].sw.l; }
+
+// Screen XY FIFO (data registers 12-14): whole word plus signed X/Y halves.
+inline uint32_t& sxy0() { return dataRegs()[12].d; }
+inline int16_t& sx0() { return dataRegs()[12].sw.l; }
+inline int16_t& sy0() { return dataRegs()[12].sw.h; }
+inline uint32_t& sxy1() { return dataRegs()[13].d; }
+inline int16_t& sx1() { return dataRegs()[13].sw.l; }
+inline int16_t& sy1() { return dataRegs()[13].sw.h; }
+inline uint32_t& sxy2() { return dataRegs()[14].d; }
+inline int16_t& sx2() { return dataRegs()[14].sw.l; }
+inline int16_t& sy2() { return dataRegs()[14].sw.h; }
+
+// Screen Z FIFO (data registers 16-19), unsigned 16-bit.
+inline uint16_t& sz0() { return dataRegs()[16].w.l; }
+inline uint16_t& sz1() { return dataRegs()[17].w.l; }
+inline uint16_t& sz2() { return dataRegs()[18].w.l; }
+inline uint16_t& sz3() { return dataRegs()[19].w.l; }
+
+// Color FIFO (data registers 20-22).
+inline uint32_t& rgb0() { return dataRegs()[20].d; }
+inline uint8_t& rgb0R() { return dataRegs()[20].b.l; }
+inline uint8_t& rgb0G() { return dataRegs()[20].b.h; }
+inline uint8_t& rgb0B() { return dataRegs()[20].b.h2; }
+inline uint32_t& rgb1() { return dataRegs()[21].d; }
+inline uint32_t& rgb2() { return dataRegs()[22].d; }
+inline uint8_t& rgb2R() { return dataRegs()[22].b.l; }
+inline uint8_t& rgb2G() { return dataRegs()[22].b.h; }
+inline uint8_t& rgb2B() { return dataRegs()[22].b.h2; }
+inline uint8_t& rgb2Cd() { return dataRegs()[22].b.h3; }
+
+// MAC0-MAC3 (data registers 24-27), signed 32-bit.
+inline int32_t& mac0() { return dataRegs()[24].sd; }
+inline int32_t& mac1() { return dataRegs()[25].sd; }
+inline int32_t& mac2() { return dataRegs()[26].sd; }
+inline int32_t& mac3() { return dataRegs()[27].sd; }
+
+// Control registers - rotation matrix.
+// Nine signed 16-bit elements packed two per register across control registers 0-4.
+inline int16_t r11() { return ctrlRegs()[0].sw.l; }
+inline int16_t r12() { return ctrlRegs()[0].sw.h; }
+inline int16_t r13() { return ctrlRegs()[1].sw.l; }
+inline int16_t r21() { return ctrlRegs()[1].sw.h; }
+inline int16_t r22() { return ctrlRegs()[2].sw.l; }
+inline int16_t r23() { return ctrlRegs()[2].sw.h; }
+inline int16_t r31() { return ctrlRegs()[3].sw.l; }
+inline int16_t r32() { return ctrlRegs()[3].sw.h; }
+inline int16_t r33() { return ctrlRegs()[4].sw.l; }
+
+// Control registers used in 64-bit arithmetic return int64_t to avoid casts at every use site.
+// The underlying storage is 32-bit or 16-bit; the widening happens here, once.
+// These return by value, so they are read-only views of the registers.
+inline int64_t trX() { return ctrlRegs()[5].sd; }
+inline int64_t trY() { return ctrlRegs()[6].sd; }
+inline int64_t trZ() { return ctrlRegs()[7].sd; }
+inline int64_t rbk() { return ctrlRegs()[13].sd; }
+inline int64_t gbk() { return ctrlRegs()[14].sd; }
+inline int64_t bbk() { return ctrlRegs()[15].sd; }
+inline int64_t rfc() { return ctrlRegs()[21].sd; }
+inline int64_t gfc() { return ctrlRegs()[22].sd; }
+inline int64_t bfc() { return ctrlRegs()[23].sd; }
+inline int64_t gteOFX() { return ctrlRegs()[24].sd; }
+inline int64_t gteOFY() { return ctrlRegs()[25].sd; }
+inline int16_t gteH() { return ctrlRegs()[26].sw.l; }  // stays 16-bit for gteDivide signature
+inline int64_t gteDQA() { return ctrlRegs()[27].sw.l; }
+inline int64_t gteDQB() { return ctrlRegs()[28].sd; }
+inline int64_t gteZSF3() { return ctrlRegs()[29].sw.l; }
+inline int64_t gteZSF4() { return ctrlRegs()[30].sw.l; }
+// FLAG (control register 31) is the only control accessor that hands out a mutable reference.
+inline uint32_t& gteFlag() { return ctrlRegs()[31].d; }
+
+// Matrix element access - compile-time (mx, row, col).
+// mx = 0..2 selects a matrix stored in control registers starting at mx * 8,
+// two signed 16-bit elements per register in row-major order.
+// Any other mx yields the "garbage" matrix assembled from unrelated registers.
+template <int mx, int row, int col>
+inline int32_t matrixElement() {
+    constexpr int slot = row * 3 + col;
+    if constexpr (mx >= 3) {
+        // Garbage matrix: {-R<<4, R<<4, IR0, R13, R13, R13, R22, R22, R22}
+        if constexpr (slot == 0) {
+            return (-static_cast<int32_t>(rgbR())) << 4;
+        } else if constexpr (slot == 1) {
+            return static_cast<int32_t>(rgbR()) << 4;
+        } else if constexpr (slot == 2) {
+            return ir0();
+        } else if constexpr (slot <= 5) {
+            return r13();
+        } else {
+            return r22();
+        }
+    } else {
+        constexpr int reg = mx * 8 + slot / 2;
+        constexpr bool upperHalf = (slot % 2) != 0;
+        if constexpr (upperHalf) {
+            return ctrlRegs()[reg].sw.h;
+        } else {
+            return ctrlRegs()[reg].sw.l;
+        }
+    }
+}
+
+// Control vector component - compile-time (cv, component).
+// cv = 3 is the zero vector; otherwise the component is the signed 32-bit value
+// of control register cv * 8 + 5 + component, widened to 64 bits.
+template <int cv, int component>
+inline int64_t controlVector() {
+    if constexpr (cv != 3) {
+        constexpr int reg = cv * 8 + 5 + component;
+        return ctrlRegs()[reg].sd;
+    } else {
+        return 0;
+    }
+}
+
+// ============================================================================
+// Division
+// ============================================================================
+
+// UNR reciprocal table generator for GTE division.
+// Formula from hardware: unrTable[i] = max(0, ((0x40000 / (i + 0x100)) + 1) / 2 - 0x101)
+struct UNRGenerator {
+    // Evaluated at compile time only; negative results are floored to 0 before narrowing.
+    static consteval uint8_t calculateValue(size_t i) {
+        int val = ((0x40000 / (int)(i + 0x100)) + 1) / 2 - 0x101;
+        return static_cast<uint8_t>(val < 0 ? 0 : val);
+    }
+};
+
+// 257 entries: gteDivide indexes it with (r1 + 0x40) >> 7 where r1 <= 0x7fff, i.e. 0..0x100.
+inline constexpr auto unrTable = generateTable<257, UNRGenerator>();
+
+// Fixed-point division used by the perspective transform.
+// Returns a value saturated to 0x1ffff. When numerator >= 2 * denominator
+// (which includes denominator == 0) the result is 0x1ffff and DIV_OVER is set.
+// Otherwise the quotient is approximated from the unrTable reciprocal seed.
+inline uint32_t gteDivide(uint16_t numerator, uint16_t denominator) {
+    if (numerator >= denominator * 2) {
+        gteFlag() |= Flag::DIV_OVER;
+        return 0x1ffff;
+    }
+
+    // Shift the denominator by its leading-zero count, then drop the top bit to get the table key.
+    int shift = GTE::countLeadingZeros16(denominator);
+    int r1 = (denominator << shift) & 0x7fff;
+    // Table seed plus the 0x101 bias that the table generator subtracted.
+    int r2 = unrTable[((r1 + 0x40) >> 7)] + 0x101;
+    // Refinement of the seed, with rounding (+0x80) before each >>8.
+    int r3 = ((0x80 - (r2 * (r1 + 0x8000))) >> 8) & 0x1ffff;
+    uint32_t reciprocal = ((r2 * r3) + 0x80) >> 8;
+    // Multiply by the equally shifted numerator in 64 bits and round (+0x8000) before the >>16.
+    uint32_t result = ((static_cast<uint64_t>(reciprocal) * (numerator << shift)) + 0x8000) >> 16;
+    return std::min<uint32_t>(0x1ffff, result);
+}
+
+// ============================================================================
+// Limiter functions
+// ============================================================================
+
+// Generic saturating limiter: returns `value` clamped to [min, max] and ORs
+// `flag` into FLAG whenever clamping actually happened.
+inline int32_t lim(int32_t value, int32_t max, int32_t min, uint32_t flag) {
+    const bool above = value > max;
+    const bool below = !above && value < min;
+    if (!above && !below) return value;
+
+    gteFlag() |= flag;
+    return above ? max : min;
+}
+
+// Applies the instruction's shift factor: an arithmetic >>12 when sf is set,
+// otherwise the value passes through unchanged.
+template <bool sf>
+inline int64_t gteShift(int64_t a) {
+    constexpr int amount = sf ? 12 : 0;
+    return a >> amount;
+}
+
+// Converts a 44-bit accumulator into a stored MAC value: ORs posFlag/negFlag
+// into FLAG for any overflow the accumulator recorded, then returns the
+// sf-shifted value truncated to 32 bits.
+template <bool sf>
+inline int32_t bounds(int44 value, uint32_t posFlag, uint32_t negFlag) {
+    uint32_t raised = 0;
+    if (value.positiveOverflow()) raised |= posFlag;
+    if (value.negativeOverflow()) raised |= negFlag;
+    gteFlag() |= raised;
+
+    const int64_t shifted = gteShift<sf>(value.value());
+    return static_cast<int32_t>(shifted);
+}
+
+// A1/A2/A3: per-channel wrappers around bounds() that supply the matching
+// MACn overflow flags.
+template <bool sf>
+inline int32_t A1(int44 a) {
+    return bounds<sf>(a, Flag::MAC1_POS, Flag::MAC1_NEG);
+}
+
+template <bool sf>
+inline int32_t A2(int44 a) {
+    return bounds<sf>(a, Flag::MAC2_POS, Flag::MAC2_NEG);
+}
+
+template <bool sf>
+inline int32_t A3(int44 a) {
+    return bounds<sf>(a, Flag::MAC3_POS, Flag::MAC3_NEG);
+}
+
+// Variant that also hands back the unshifted accumulator value through rawOut.
+template <bool sf>
+inline int32_t A3(int44 a, int64_t& rawOut) {
+    rawOut = a.value();
+    return A3<sf>(a);
+}
+
+// MAC0 overflow check: raises MAC0_POS / MAC0_NEG when `a` lies outside the
+// signed 32-bit range and returns `a` unchanged.
+inline int64_t F(int64_t a) {
+    constexpr int64_t kMax = INT64_C(0x7fffffff);
+    constexpr int64_t kMin = INT64_C(-0x80000000);
+    if (a > kMax) gteFlag() |= Flag::MAC0_POS;
+    if (a < kMin) gteFlag() |= Flag::MAC0_NEG;
+    return a;
+}
+
+// Same check, additionally copying the unmodified value into rawOut.
+inline int64_t F(int64_t a, int64_t& rawOut) {
+    rawOut = a;
+    return F(a);
+}
+
+// IR1-3 limiters: clamp to [-0x8000, 0x7fff], or to [0, 0x7fff] when lm is
+// set, raising the channel's saturation flag on clamp.
+template <bool lm>
+inline int32_t limB1(int32_t a) {
+    constexpr int32_t floor = lm ? 0 : -0x8000;
+    return lim(a, 0x7fff, floor, Flag::IR1_SAT);
+}
+template <bool lm>
+inline int32_t limB2(int32_t a) {
+    constexpr int32_t floor = lm ? 0 : -0x8000;
+    return lim(a, 0x7fff, floor, Flag::IR2_SAT);
+}
+template <bool lm>
+inline int32_t limB3(int32_t a) {
+    constexpr int32_t floor = lm ? 0 : -0x8000;
+    return lim(a, 0x7fff, floor, Flag::IR3_SAT);
+}
+
+// IR3 limiter used by rtps. The saturation flag is decided from the raw MAC3
+// shifted by 12 against the full signed 16-bit range (ignoring sf and lm),
+// while the returned value is the sf-shifted MAC3 clamped per lm.
+template <bool sf, bool lm>
+inline int32_t limB3sf(int64_t rawMac3) {
+    const auto flagProbe = static_cast<int32_t>(rawMac3 >> 12);
+    const bool outOfRange = flagProbe > 0x7fff || flagProbe < -0x8000;
+    if (outOfRange) gteFlag() |= Flag::IR3_SAT;
+
+    constexpr int32_t floor = lm ? 0 : -0x8000;
+    const auto stored = static_cast<int32_t>(gteShift<sf>(rawMac3));
+    return std::clamp<int32_t>(stored, floor, 0x7fff);
+}
+
+// Color channel limiters: clamp to 0..0xff and raise the per-channel color
+// saturation flag on clamp.
+inline int32_t limC1(int32_t a) { return lim(a, 0xff, 0, Flag::COLOR_R_SAT); }
+inline int32_t limC2(int32_t a) { return lim(a, 0xff, 0, Flag::COLOR_G_SAT); }
+inline int32_t limC3(int32_t a) { return lim(a, 0xff, 0, Flag::COLOR_B_SAT); }
+
+// Depth limiter: applies the sf shift, truncates to 32 bits, then clamps to
+// 0..0xffff, raising SZ_SAT on clamp.
+template <bool sf>
+inline int32_t limD(int64_t a) {
+    const auto depth = static_cast<int32_t>(gteShift<sf>(a));
+    return lim(depth, 0xffff, 0, Flag::SZ_SAT);
+}
+
+// Screen coordinate limiters: clamp to [-0x400, 0x3ff], raising SX_SAT (limG1)
+// or SY_SAT (limG2) when the input was out of range.
+inline int32_t limG1(int64_t a) {
+    const int64_t bounded = std::clamp<int64_t>(a, -0x400, 0x3ff);
+    if (bounded != a) gteFlag() |= Flag::SX_SAT;
+    return static_cast<int32_t>(bounded);
+}
+
+inline int32_t limG2(int64_t a) {
+    const int64_t bounded = std::clamp<int64_t>(a, -0x400, 0x3ff);
+    if (bounded != a) gteFlag() |= Flag::SY_SAT;
+    return static_cast<int32_t>(bounded);
+}
+
+// Flag-free clamps to [-0x4000000, 0x3ffffff]; rtps uses them on the screen
+// coordinate sums (before the >>16) that it forwards to PGXP.
+inline int32_t limG1ia(int64_t a) {
+    const int64_t bounded = std::clamp<int64_t>(a, -0x4000000, 0x3ffffff);
+    return static_cast<int32_t>(bounded);
+}
+inline int32_t limG2ia(int64_t a) {
+    const int64_t bounded = std::clamp<int64_t>(a, -0x4000000, 0x3ffffff);
+    return static_cast<int32_t>(bounded);
+}
+
+// IR0 limiter: takes the unclamped MAC0, shifts it right by 12 and clamps the
+// result to 0..0x1000, raising IR0_SAT when clamping was needed.
+// The clamp is performed on the 64-bit value so that the returned result always
+// agrees with the flag decision, even for inputs whose shifted value would not
+// fit in 32 bits.
+inline int32_t limH(int64_t rawMac0) {
+    const int64_t shifted = rawMac0 >> 12;
+    if (shifted < 0 || shifted > 0x1000) gteFlag() |= Flag::IR0_SAT;
+    return static_cast<int32_t>(std::clamp<int64_t>(shifted, 0, 0x1000));
+}
+
+// ============================================================================
+// FIFO operations
+// ============================================================================
+
+// Advances the screen Z FIFO by one slot (SZ0 is dropped) and stores z in SZ3.
+inline void pushZ(uint16_t z) {
+    sz0() = sz1();
+    sz1() = sz2();
+    sz2() = sz3();
+    sz3() = z;
+}
+
+// Advances the color FIFO by one slot and fills RGB2 from the current state:
+// the code byte is copied from RGBC, each channel is MACn >> 4 limited to
+// 0..0xff (raising the channel's color saturation flag on clamp).
+inline void pushColor() {
+    rgb0() = rgb1();
+    rgb1() = rgb2();
+
+    rgb2Cd() = rgbCode();
+    rgb2R() = limC1(mac1() >> 4);
+    rgb2G() = limC2(mac2() >> 4);
+    rgb2B() = limC3(mac3() >> 4);
+}
+
+// ============================================================================
+// Pipeline stage: matrix-vector multiply (fully templatized)
+// ============================================================================
+
+// MAC1-3 = (CV << 12) + MX * V, then IR1-3 = limB<lm>(MAC1-3).
+// mx, v and cv select the matrix, input vector and control vector at compile
+// time (see matrixElement, vertexX/Y/Z and controlVector). rawMac3 receives
+// the unshifted third accumulator.
+// cv == 2 takes a separate path in which only matrix columns 1 and 2 reach
+// the stored result.
+template <bool sf, bool lm, int mx, int v, int cv>
+inline void matrixVectorMultiply(int64_t& rawMac3) {
+    if constexpr (cv == 2) {
+        // FC bug path: columns 1-2 first, then column 0 for FLAG only
+        mac1() = A1<sf>(int44(matrixElement<mx, 0, 1>() * vertexY<v>()) +
+                        matrixElement<mx, 0, 2>() * vertexZ<v>());
+        mac2() = A2<sf>(int44(matrixElement<mx, 1, 1>() * vertexY<v>()) +
+                        matrixElement<mx, 1, 2>() * vertexZ<v>());
+        mac3() = A3<sf>(int44(matrixElement<mx, 2, 1>() * vertexY<v>()) +
+                        matrixElement<mx, 2, 2>() * vertexZ<v>(), rawMac3);
+        // Column 0: FLAG side effects only, results discarded
+        limB1<false>(A1<sf>(int44(controlVector<cv, 0>() << 12) +
+                            matrixElement<mx, 0, 0>() * vertexX<v>()));
+        limB2<false>(A2<sf>(int44(controlVector<cv, 1>() << 12) +
+                            matrixElement<mx, 1, 0>() * vertexX<v>()));
+        limB3<false>(A3<sf>(int44(controlVector<cv, 2>() << 12) +
+                            matrixElement<mx, 2, 0>() * vertexX<v>()));
+    } else {
+        // Regular path: each addition goes through int44 so overflow is tracked term by term.
+        mac1() = A1<sf>(int44(controlVector<cv, 0>() << 12) +
+                        matrixElement<mx, 0, 0>() * vertexX<v>() +
+                        matrixElement<mx, 0, 1>() * vertexY<v>() +
+                        matrixElement<mx, 0, 2>() * vertexZ<v>());
+        mac2() = A2<sf>(int44(controlVector<cv, 1>() << 12) +
+                        matrixElement<mx, 1, 0>() * vertexX<v>() +
+                        matrixElement<mx, 1, 1>() * vertexY<v>() +
+                        matrixElement<mx, 1, 2>() * vertexZ<v>());
+        mac3() = A3<sf>(int44(controlVector<cv, 2>() << 12) +
+                        matrixElement<mx, 2, 0>() * vertexX<v>() +
+                        matrixElement<mx, 2, 1>() * vertexY<v>() +
+                        matrixElement<mx, 2, 2>() * vertexZ<v>(), rawMac3);
+    }
+    // IR1-3 are written only after all three MACs: with v == 3 the input vector is IR1-3 itself.
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3<lm>(mac3());
+}
+
+// Convenience overload for callers that do not need the unshifted MAC3.
+template <bool sf, bool lm, int mx, int v, int cv>
+inline void matrixVectorMultiply() {
+    int64_t unused;
+    matrixVectorMultiply<sf, lm, mx, v, cv>(unused);
+}
+
+// ============================================================================
+// Pipeline stage: light transform - L * V(v) -> MAC/IR
+// ============================================================================
+
+// Matrix 1 times vertex v with no control vector added (cv = 3).
+template <bool sf, bool lm, int v>
+inline void lightTransform() {
+    matrixVectorMultiply<sf, lm, 1, v, 3>();
+}
+
+// ============================================================================
+// Pipeline stage: color matrix - BK + C * IR -> MAC/IR
+// ============================================================================
+
+// Matrix 2 times the current IR1-3 (v = 3) plus control vector 1.
+template <bool sf, bool lm>
+inline void colorMatrix() {
+    matrixVectorMultiply<sf, lm, 2, 3, 1>();
+}
+
+// ============================================================================
+// Pipeline stage: depth cue interpolation
+// ============================================================================
+
+// Depth-cue interpolation stage used by dpcs, intpl, dpct and depthCueColor.
+// For each channel: diff = limB<false>(A<sf>((FC << 12) - in)), then
+// MAC = A<sf>(in + IR0 * diff) and finally IR = limB<lm>(MAC).
+// inR/inG/inB are the unshifted per-channel inputs.
+template <bool sf, bool lm>
+inline void depthCue(int64_t inR, int64_t inG, int64_t inB) {
+    mac1() = A1<sf>(inR + ir0() * limB1<false>(A1<sf>((rfc() << 12) - inR)));
+    mac2() = A2<sf>(inG + ir0() * limB2<false>(A2<sf>((gfc() << 12) - inG)));
+    // The unshifted MAC3 is not needed here, so the plain A3 overload is used.
+    mac3() = A3<sf>(inB + ir0() * limB3<false>(A3<sf>((bfc() << 12) - inB)));
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3<lm>(mac3());
+}
+
+// Feeds the depthCue stage with (RGBC channel << 4) * IRn for each channel.
+template <bool sf, bool lm>
+inline void depthCueColor() {
+    const int64_t red = static_cast<int64_t>(rgbR() << 4) * ir1();
+    const int64_t green = static_cast<int64_t>(rgbG() << 4) * ir2();
+    const int64_t blue = static_cast<int64_t>(rgbB() << 4) * ir3();
+    depthCue<sf, lm>(red, green, blue);
+}
+
+// ============================================================================
+// Pipeline stage: color apply - RGBC * IR -> MAC/IR
+// ============================================================================
+
+// MAC1-3 = (RGBC channel << 4) * IRn (shifted per sf), then IR1-3 =
+// limB<lm>(MAC1-3).
+template <bool sf, bool lm>
+inline void colorApply() {
+    // All three products are formed from the incoming IR values before any IR is rewritten.
+    const int64_t red = static_cast<int64_t>(rgbR() << 4) * ir1();
+    const int64_t green = static_cast<int64_t>(rgbG() << 4) * ir2();
+    const int64_t blue = static_cast<int64_t>(rgbB() << 4) * ir3();
+
+    mac1() = A1<sf>(red);
+    mac2() = A2<sf>(green);
+    mac3() = A3<sf>(blue);
+
+    ir1() = limB1<lm>(mac1());
+    ir2() = limB2<lm>(mac2());
+    ir3() = limB3<lm>(mac3());
+}
+
+// ============================================================================
+// Dispatch helpers
+// ============================================================================
+
+// Packs the sf bit (bit 19) and the lm bit (bit 10) of an instruction word
+// into a 0..3 index: bit 1 = sf, bit 0 = lm.
+inline unsigned sfLmIndex(uint32_t op) {
+    const unsigned sfBit = (op >> 19) & 1u;
+    const unsigned lmBit = (op >> 10) & 1u;
+    return (sfBit << 1) | lmBit;
+}
+
+// Generate a 256-entry dispatch table for MVMVA (sf * lm * mx * v * cv).
+// Index layout: [sf:1][lm:1][mx:2][v:2][cv:2]
+// Each index in the sequence is decoded into the five template arguments and
+// the table stores the address of the matching Impl<...>::fn. The table size
+// equals the length of the index sequence passed in.
+template <typename Fn, template <bool, bool, int, int, int> class Impl, size_t... Is>
+constexpr auto makeMvmvaTable(std::index_sequence<Is...>) {
+    return std::array<Fn, sizeof...(Is)>{
+        Impl<bool(Is >> 7), bool((Is >> 6) & 1), int((Is >> 4) & 3), int((Is >> 2) & 3), int(Is & 3)>::fn...};
+}
+
+}  // namespace GTEImpl
+}  // namespace PCSX
diff --git a/src/core/gte-transfer.cc b/src/core/gte-transfer.cc
new file mode 100644
index 000000000..6768c5ed9
--- /dev/null
+++ b/src/core/gte-transfer.cc
@@ -0,0 +1,126 @@
+/***************************************************************************
+ *   Copyright (C) 2026 PCSX-Redux authors                                *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.           *
+ ***************************************************************************/
+
+// GTE COP2 data transfer operations: MFC2, MTC2, CFC2, CTC2, LWC2, SWC2.
+
+#include "core/gte.h"
+#include "core/gte-internal.h"
+#include "core/psxmem.h"
+
+using namespace PCSX::GTEImpl;
+
+uint32_t PCSX::GTE::MFC2(uint32_t code) {  // instruction-word overload: hands _Rd_ (as int) to MFC2(int)
+    return MFC2(static_cast<int>(_Rd_));
+}
+
+uint32_t PCSX::GTE::MFC2(int reg) {
+    auto& slot = dataRegs()[reg];  // the register is normalized in place, then read back
+    switch (reg) {
+        case 1: case 3: case 5: case 8: case 9: case 10: case 11:  // widen sw.l through int32_t
+            slot.d = static_cast<int32_t>(slot.sw.l);
+            break;
+        case 7: case 16: case 17: case 18: case 19:  // widen w.l through uint32_t
+            slot.d = static_cast<uint32_t>(slot.w.l);
+            break;
+        case 15:  // register 15 reads back as sxy2()
+            slot.d = sxy2();
+            break;
+        case 28: case 29: {  // pack lim(IRn >> 7, 0x1f, 0, 0) for n = 1..3 at bit offsets 0, 5 and 10
+            const auto low = lim(ir1() >> 7, 0x1f, 0, 0);
+            const auto mid = lim(ir2() >> 7, 0x1f, 0, 0);
+            const auto high = lim(ir3() >> 7, 0x1f, 0, 0);
+            slot.d = low | (mid << 5) | (high << 10);
+        } break;
+    }
+    return slot.d;
+}
+
+uint32_t PCSX::GTE::CFC2(uint32_t code) {  // returns control register _Rd_ exactly as stored
+    return ctrlRegs()[_Rd_].d;
+}
+
+void PCSX::GTE::MTC2(uint32_t value, int reg) {
+    auto* regs = dataRegs();
+    // Register 31 is never written through this path.
+    if (reg == 31) return;
+    if (reg == 15) {
+        // Move the screen-XY slots down one place before the new value lands in sxy2().
+        sxy0() = sxy1();
+        sxy1() = sxy2();
+        sxy2() = value;
+    } else if (reg == 28) {
+        // Unpack the 5-bit fields at bits 0-4, 5-9 and 10-14 into IR1..IR3, each ending up at bit 7.
+        ir1() = (value & 0x1f) << 7;
+        ir2() = (value & 0x3e0) << 2;
+        ir3() = (value & 0x7c00) >> 3;
+    } else if (reg == 30) {
+        // A write to register 30 also refreshes register 31 with countLeadingBits(value).
+        regs[31].d = countLeadingBits(value);
+    }
+    // Every register other than 31 additionally stores the raw value.
+    regs[reg].d = value;
+}
+
+void PCSX::GTE::MTC2(uint32_t code) {  // instruction-word overload: writes GPR[_Rt_] to data register _Rd_
+    MTC2(g_emulator->m_cpu->m_regs.GPR.r[_Rt_], _Rd_);
+}
+
+void PCSX::GTE::CTC2(uint32_t value, int reg) {
+    // Registers listed here keep only their low 16 bits, widened through int16_t -> int32_t.
+    const bool narrow = reg == 4 || reg == 12 || reg == 20 || reg == 26 || reg == 27 || reg == 29 || reg == 30;
+    if (narrow) {
+        value = static_cast<int32_t>(static_cast<int16_t>(value));
+    } else if (reg == 31) {
+        // Register 31: only the bits in 0x7ffff000 are kept from the written value;
+        // GTE_ERROR is then set when any of the ERROR_BITS survived the mask.
+        value &= 0x7ffff000;
+        if (value & Flag::ERROR_BITS) value |= Flag::GTE_ERROR;
+    }
+    ctrlRegs()[reg].d = value;
+}
+
+void PCSX::GTE::CTC2(uint32_t code) {  // instruction-word overload: writes GPR[_Rt_] to control register _Rd_
+    CTC2(g_emulator->m_cpu->m_regs.GPR.r[_Rt_], _Rd_);
+}
+
+void PCSX::GTE::LWC2(uint32_t code) {
+    auto& cpu = *g_emulator->m_cpu;
+    const uint32_t addr = cpu.m_regs.GPR.r[_Rs_] + _Imm_;
+    if ((addr & 3) == 0) {
+        MTC2(g_emulator->m_mem->read32(addr), _Rt_);
+        return;
+    }
+    cpu.m_regs.pc -= 4;  // misaligned: pc is stepped back one word before it is logged and the exception raised
+    g_system->log(LogClass::CPU, _("Unaligned address 0x%08x in LWC2 from 0x%08x\n"), addr, cpu.m_regs.pc);
+    cpu.m_regs.CP0.n.BadVAddr = addr;
+    cpu.exception(R3000Acpu::Exception::LoadAddressError, cpu.m_inDelaySlot);
+}
+
+void PCSX::GTE::SWC2(uint32_t code) {
+    auto& cpu = *g_emulator->m_cpu;
+    const uint32_t addr = cpu.m_regs.GPR.r[_Rs_] + _Imm_;
+    if ((addr & 3) == 0) {
+        g_emulator->m_mem->write32(addr, MFC2(static_cast<int>(_Rt_)));
+        return;
+    }
+    cpu.m_regs.pc -= 4;  // misaligned: pc is stepped back one word before it is logged and the exception raised
+    g_system->log(LogClass::CPU, _("Unaligned address 0x%08x in SWC2 from 0x%08x\n"), addr, cpu.m_regs.pc);
+    cpu.m_regs.CP0.n.BadVAddr = addr;
+    cpu.exception(R3000Acpu::Exception::StoreAddressError, cpu.m_inDelaySlot);
+}
diff --git a/src/core/gte.cc b/src/core/gte.cc
deleted file mode 100644
index cf9799466..000000000
--- a/src/core/gte.cc
+++ /dev/null
@@ -1,1026 +0,0 @@
-/*
- * PlayStation Geometry Transformation Engine emulator
- *
- * Copyright 2003-2013 smf
- *
- */
-
-#include "core/gte.h"
-
-#include <algorithm>
-
-#include "core/pgxp_debug.h"
-#include "core/pgxp_gte.h"
-#include "core/psxmem.h"
-
-#undef GTE_SF
-#undef GTE_MX
-#undef GTE_V
-#undef GTE_CV
-#undef GTE_LM
-#undef GTE_FUNCT
-
-#undef VX0
-#undef VY0
-#undef VZ0
-#undef VX1
-#undef VY1
-#undef VZ1
-#undef VX2
-#undef VY2
-#undef VZ2
-#undef R
-#undef G
-#undef B
-#undef CODE
-#undef OTZ
-#undef IR0
-#undef IR1
-#undef IR2
-#undef IR3
-#undef SXY0
-#undef SX0
-#undef SY0
-#undef SXY1
-#undef SX1
-#undef SY1
-#undef SXY2
-#undef SX2
-#undef SY2
-#undef SXYP
-#undef SXP
-#undef SYP
-#undef SZ0
-#undef SZ1
-#undef SZ2
-#undef SZ3
-#undef RGB0
-#undef R0
-#undef G0
-#undef B0
-#undef CD0
-#undef RGB1
-#undef R1
-#undef G1
-#undef B1
-#undef CD1
-#undef RGB2
-#undef R2
-#undef G2
-#undef B2
-#undef CD2
-#undef RES1
-#undef MAC0
-#undef MAC1
-#undef MAC2
-#undef MAC3
-#undef IRGB
-#undef ORGB
-#undef LZCS
-#undef LZCR
-
-#undef R11
-#undef R12
-#undef R13
-#undef R21
-#undef R22
-#undef R23
-#undef R31
-#undef R32
-#undef R33
-#undef TRX
-#undef TRY
-#undef TRZ
-#undef L11
-#undef L12
-#undef L13
-#undef L21
-#undef L22
-#undef L23
-#undef L31
-#undef L32
-#undef L33
-#undef RBK
-#undef GBK
-#undef BBK
-#undef LR1
-#undef LR2
-#undef LR3
-#undef LG1
-#undef LG2
-#undef LG3
-#undef LB1
-#undef LB2
-#undef LB3
-#undef RFC
-#undef GFC
-#undef BFC
-#undef OFX
-#undef OFY
-#undef H
-#undef DQA
-#undef DQB
-#undef ZSF3
-#undef ZSF4
-#undef FLAG
-
-#undef VX
-#undef VY
-#undef VZ
-#undef MX11
-#undef MX12
-#undef MX13
-#undef MX21
-#undef MX22
-#undef MX23
-#undef MX31
-#undef MX32
-#undef MX33
-#undef CV1
-#undef CV2
-#undef CV3
-
-#define GTE_SF(op) ((op >> 19) & 1)
-#define GTE_MX(op) ((op >> 17) & 3)
-#define GTE_V(op) ((op >> 15) & 3)
-#define GTE_CV(op) ((op >> 13) & 3)
-#define GTE_LM(op) ((op >> 10) & 1)
-#define GTE_FUNCT(op) (op & 63)
-
-#define VX0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[0].sw.l)
-#define VY0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[0].sw.h)
-#define VZ0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[1].sw.l)
-#define VX1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[2].w.l)
-#define VY1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[2].w.h)
-#define VZ1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[3].w.l)
-#define VX2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[4].w.l)
-#define VY2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[4].w.h)
-#define VZ2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[5].w.l)
-#define R (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[6].b.l)
-#define G (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[6].b.h)
-#define B (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[6].b.h2)
-#define CODE (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[6].b.h3)
-#define OTZ (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[7].w.l)
-#define IR0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[8].sw.l)
-#define IR1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[9].sw.l)
-#define IR2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[10].sw.l)
-#define IR3 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[11].sw.l)
-#define SXY0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[12].d)
-#define SX0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[12].sw.l)
-#define SY0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[12].sw.h)
-#define SXY1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[13].d)
-#define SX1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[13].sw.l)
-#define SY1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[13].sw.h)
-#define SXY2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[14].d)
-#define SX2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[14].sw.l)
-#define SY2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[14].sw.h)
-#define SXYP (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[15].d)
-#define SXP (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[15].sw.l)
-#define SYP (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[15].sw.h)
-#define SZ0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[16].w.l)
-#define SZ1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[17].w.l)
-#define SZ2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[18].w.l)
-#define SZ3 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[19].w.l)
-#define RGB0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[20].d)
-#define R0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[20].b.l)
-#define G0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[20].b.h)
-#define B0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[20].b.h2)
-#define CD0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[20].b.h3)
-#define RGB1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[21].d)
-#define R1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[21].b.l)
-#define G1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[21].b.h)
-#define B1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[21].b.h2)
-#define CD1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[21].b.h3)
-#define RGB2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[22].d)
-#define R2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[22].b.l)
-#define G2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[22].b.h)
-#define B2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[22].b.h2)
-#define CD2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[22].b.h3)
-#define RES1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[23].d)
-#define MAC0 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[24].sd)
-#define MAC1 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[25].sd)
-#define MAC2 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[26].sd)
-#define MAC3 (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[27].sd)
-#define IRGB (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[28].d)
-#define ORGB (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[29].d)
-#define LZCS (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[30].d)
-#define LZCR (PCSX::g_emulator->m_cpu->m_regs.CP2D.p[31].d)
-
-#define R11 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[0].sw.l)
-#define R12 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[0].sw.h)
-#define R13 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[1].sw.l)
-#define R21 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[1].sw.h)
-#define R22 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[2].sw.l)
-#define R23 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[2].sw.h)
-#define R31 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[3].sw.l)
-#define R32 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[3].sw.h)
-#define R33 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[4].sw.l)
-#define TRX (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[5].sd)
-#define TRY (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[6].sd)
-#define TRZ (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[7].sd)
-#define L11 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[8].sw.l)
-#define L12 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[8].sw.h)
-#define L13 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[9].sw.l)
-#define L21 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[9].sw.h)
-#define L22 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[10].sw.l)
-#define L23 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[10].sw.h)
-#define L31 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[11].sw.l)
-#define L32 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[11].sw.h)
-#define L33 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[12].sw.l)
-#define RBK (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[13].sd)
-#define GBK (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[14].sd)
-#define BBK (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[15].sd)
-#define LR1 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[16].sw.l)
-#define LR2 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[16].sw.h)
-#define LR3 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[17].sw.l)
-#define LG1 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[17].sw.h)
-#define LG2 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[18].sw.l)
-#define LG3 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[18].sw.h)
-#define LB1 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[19].sw.l)
-#define LB2 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[19].sw.h)
-#define LB3 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[20].sw.l)
-#define RFC (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[21].sd)
-#define GFC (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[22].sd)
-#define BFC (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[23].sd)
-#define OFX (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[24].sd)
-#define OFY (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[25].sd)
-#define H (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[26].sw.l)
-#define DQA (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[27].sw.l)
-#define DQB (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[28].sd)
-#define ZSF3 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[29].sw.l)
-#define ZSF4 (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[30].sw.l)
-#define FLAG (PCSX::g_emulator->m_cpu->m_regs.CP2C.p[31].d)
-
-#define VX(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2D.p[n << 1].sw.l : IR1)
-#define VY(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2D.p[n << 1].sw.h : IR2)
-#define VZ(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2D.p[(n << 1) + 1].sw.l : IR3)
-#define MX11(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3)].sw.l : -R << 4)
-#define MX12(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3)].sw.h : R << 4)
-#define MX13(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 1].sw.l : IR0)
-#define MX21(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 1].sw.h : R13)
-#define MX22(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 2].sw.l : R13)
-#define MX23(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 2].sw.h : R13)
-#define MX31(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 3].sw.l : R22)
-#define MX32(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 3].sw.h : R22)
-#define MX33(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 4].sw.l : R22)
-#define CV1(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 5].sd : 0)
-#define CV2(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 6].sd : 0)
-#define CV3(n) (n < 3 ? PCSX::g_emulator->m_cpu->m_regs.CP2C.p[(n << 3) + 7].sd : 0)
-
-static int32_t LIM(int32_t value, int32_t max, int32_t min, uint32_t flag) {
-    if (value > max) {
-        FLAG |= flag;
-        return max;
-    } else if (value < min) {
-        FLAG |= flag;
-        return min;
-    }
-
-    return value;
-}
-
-uint32_t PCSX::GTE::MFC2_internal(int reg) {
-    switch (reg) {
-        case 1:
-        case 3:
-        case 5:
-        case 8:
-        case 9:
-        case 10:
-        case 11:
-            PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].d = (int32_t)PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].sw.l;
-            break;
-
-        case 7:
-        case 16:
-        case 17:
-        case 18:
-        case 19:
-            PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].d = (uint32_t)PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].w.l;
-            break;
-
-        case 15:
-            PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].d = SXY2;
-            break;
-
-        case 28:
-        case 29:
-            PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].d =
-                LIM(IR1 >> 7, 0x1f, 0, 0) | (LIM(IR2 >> 7, 0x1f, 0, 0) << 5) | (LIM(IR3 >> 7, 0x1f, 0, 0) << 10);
-            break;
-    }
-
-    return PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].d;
-}
-
-void PCSX::GTE::MTC2_internal(uint32_t value, int reg) {
-    switch (reg) {
-        case 15:
-            SXY0 = SXY1;
-            SXY1 = SXY2;
-            SXY2 = value;
-            break;
-
-        case 28:
-            IR1 = (value & 0x1f) << 7;
-            IR2 = (value & 0x3e0) << 2;
-            IR3 = (value & 0x7c00) >> 3;
-            break;
-
-        case 30:
-            LZCR = countLeadingBits(value);
-            break;
-
-        case 31:
-            return;
-    }
-
-    PCSX::g_emulator->m_cpu->m_regs.CP2D.p[reg].d = value;
-}
-
-void PCSX::GTE::CTC2_internal(uint32_t value, int reg) {
-    switch (reg) {
-        case 4:
-        case 12:
-        case 20:
-        case 26:
-        case 27:
-        case 29:
-        case 30:
-            value = (int32_t)(int16_t)value;
-            break;
-
-        case 31:
-            value = value & 0x7ffff000;
-            if ((value & 0x7f87e000) != 0) value |= 0x80000000;
-            break;
-    }
-
-    PCSX::g_emulator->m_cpu->m_regs.CP2C.p[reg].d = value;
-}
-
-// Push a Z value to the Z-coordinate FIFO
-void PCSX::GTE::pushZ(uint16_t z) {
-    SZ0 = SZ1;
-    SZ1 = SZ2;
-    SZ2 = SZ3;
-    SZ3 = z;
-}
-
-// Arithmetic shift right by (sf * 12)
-static inline int64_t gte_shift(int64_t a, int sf) { return sf == 0 ? a : a >> 12; }
-// Shift left by (sf * 12) for GPL
-static inline int64_t gte_shift_GPL(int64_t a, int sf) { return sf == 0 ? a : a << 12; }
-
-int32_t PCSX::GTE::BOUNDS(int44 value, int max_flag, int min_flag) {
-    if (value.positiveOverflow()) FLAG |= max_flag;
-    if (value.negativeOverflow()) FLAG |= min_flag;
-
-    return gte_shift(value.value(), s_sf);
-}
-
-static uint32_t gte_divide(uint16_t numerator, uint16_t denominator) {
-    if (numerator >= denominator * 2) {  // Division overflow
-        FLAG |= (1 << 31) | (1 << 17);
-        return 0x1ffff;
-    }
-
-    static uint8_t table[] = {
-        0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf3, 0xf1, 0xef, 0xee, 0xec, 0xea, 0xe8, 0xe6, 0xe4, 0xe3, 0xe1, 0xdf,
-        0xdd, 0xdc, 0xda, 0xd8, 0xd6, 0xd5, 0xd3, 0xd1, 0xd0, 0xce, 0xcd, 0xcb, 0xc9, 0xc8, 0xc6, 0xc5, 0xc3, 0xc1,
-        0xc0, 0xbe, 0xbd, 0xbb, 0xba, 0xb8, 0xb7, 0xb5, 0xb4, 0xb2, 0xb1, 0xb0, 0xae, 0xad, 0xab, 0xaa, 0xa9, 0xa7,
-        0xa6, 0xa4, 0xa3, 0xa2, 0xa0, 0x9f, 0x9e, 0x9c, 0x9b, 0x9a, 0x99, 0x97, 0x96, 0x95, 0x94, 0x92, 0x91, 0x90,
-        0x8f, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x7f, 0x7e, 0x7d, 0x7c, 0x7b,
-        0x7a, 0x79, 0x78, 0x77, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70, 0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68,
-        0x67, 0x66, 0x65, 0x64, 0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58, 0x57,
-        0x56, 0x55, 0x54, 0x53, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4d, 0x4c, 0x4b, 0x4a, 0x49, 0x48, 0x48,
-        0x47, 0x46, 0x45, 0x44, 0x43, 0x43, 0x42, 0x41, 0x40, 0x3f, 0x3f, 0x3e, 0x3d, 0x3c, 0x3c, 0x3b, 0x3a, 0x39,
-        0x39, 0x38, 0x37, 0x36, 0x36, 0x35, 0x34, 0x33, 0x33, 0x32, 0x31, 0x31, 0x30, 0x2f, 0x2e, 0x2e, 0x2d, 0x2c,
-        0x2c, 0x2b, 0x2a, 0x2a, 0x29, 0x28, 0x28, 0x27, 0x26, 0x26, 0x25, 0x24, 0x24, 0x23, 0x22, 0x22, 0x21, 0x20,
-        0x20, 0x1f, 0x1e, 0x1e, 0x1d, 0x1d, 0x1c, 0x1b, 0x1b, 0x1a, 0x19, 0x19, 0x18, 0x18, 0x17, 0x16, 0x16, 0x15,
-        0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x0f, 0x0f, 0x0e, 0x0e, 0x0d, 0x0d, 0x0c, 0x0c, 0x0b,
-        0x0a, 0x0a, 0x09, 0x09, 0x08, 0x08, 0x07, 0x07, 0x06, 0x06, 0x05, 0x05, 0x04, 0x04, 0x03, 0x03, 0x02, 0x02,
-        0x01, 0x01, 0x00, 0x00, 0x00};
-
-    int shift = PCSX::GTE::countLeadingZeros16(denominator);
-
-    int r1 = (denominator << shift) & 0x7fff;
-    int r2 = table[((r1 + 0x40) >> 7)] + 0x101;
-    int r3 = ((0x80 - (r2 * (r1 + 0x8000))) >> 8) & 0x1ffff;
-    uint32_t reciprocal = ((r2 * r3) + 0x80) >> 8;
-
-    const uint32_t res = ((((uint64_t)reciprocal * (numerator << shift)) + 0x8000) >> 16);
-
-    // Some divisions like 0xF015/0x780B result in 0x20000, but are saturated to 0x1ffff without setting FLAG
-    return std::min<uint32_t>(0x1ffff, res);
-}
-
-// Setting bits 12 & 19-22 in FLAG does not set bit 31
-
-int32_t PCSX::GTE::A1(int44 a) { return BOUNDS(a, (1 << 31) | (1 << 30), (1 << 31) | (1 << 27)); }
-int32_t PCSX::GTE::A2(int44 a) { return BOUNDS(a, (1 << 31) | (1 << 29), (1 << 31) | (1 << 26)); }
-int32_t PCSX::GTE::A3(int44 a) {
-    s_mac3 = a.value();
-    return BOUNDS(a, (1 << 31) | (1 << 28), (1 << 31) | (1 << 25));
-}
-static int32_t Lm_B1(int32_t a, int lm) { return LIM(a, 0x7fff, -0x8000 * !lm, (1 << 31) | (1 << 24)); }
-static int32_t Lm_B2(int32_t a, int lm) { return LIM(a, 0x7fff, -0x8000 * !lm, (1 << 31) | (1 << 23)); }
-static int32_t Lm_B3(int32_t a, int lm) { return LIM(a, 0x7fff, -0x8000 * !lm, (1 << 22)); }
-
-static int32_t Lm_B3_sf(int64_t value, int sf, int lm) {
-    int32_t value_sf = gte_shift(value, sf);
-    int32_t value_12 = gte_shift(value, 1);
-    constexpr int32_t max = 0x7fff;
-    int32_t min = 0;
-    if (lm == 0) min = -0x8000;
-
-    if (value_12 < -0x8000 || value_12 > 0x7fff) FLAG |= (1 << 22);
-    return std::clamp<int32_t>(value_sf, min, max);
-}
-
-static int32_t Lm_C1(int32_t a) { return LIM(a, 0x00ff, 0x0000, (1 << 21)); }
-static int32_t Lm_C2(int32_t a) { return LIM(a, 0x00ff, 0x0000, (1 << 20)); }
-static int32_t Lm_C3(int32_t a) { return LIM(a, 0x00ff, 0x0000, (1 << 19)); }
-static int32_t Lm_D(int64_t a, int sf) { return LIM(gte_shift(a, sf), 0xffff, 0x0000, (1 << 31) | (1 << 18)); }
-
-int64_t PCSX::GTE::F(int64_t a) {
-    s_mac0 = a;
-
-    if (a > S64(0x7fffffff)) FLAG |= (1 << 31) | (1 << 16);
-
-    if (a < S64(-0x80000000)) FLAG |= (1 << 31) | (1 << 15);
-
-    return a;
-}
-
-static int32_t Lm_G1(int64_t a) {
-    if (a > 0x3ff) {
-        FLAG |= (1 << 31) | (1 << 14);
-        return 0x3ff;
-    }
-    if (a < -0x400) {
-        FLAG |= (1 << 31) | (1 << 14);
-        return -0x400;
-    }
-
-    return a;
-}
-
-static int32_t Lm_G2(int64_t a) {
-    if (a > 0x3ff) {
-        FLAG |= (1 << 31) | (1 << 13);
-        return 0x3ff;
-    }
-
-    if (a < -0x400) {
-        FLAG |= (1 << 31) | (1 << 13);
-        return -0x400;
-    }
-
-    return a;
-}
-
-static int32_t Lm_G1_ia(int64_t a) { return std::clamp<int64_t>(a, -0x4000000, 0x3ffffff); }
-static int32_t Lm_G2_ia(int64_t a) { return std::clamp<int64_t>(a, -0x4000000, 0x3ffffff); }
-
-static int32_t Lm_H(int64_t value, int sf) {
-    int64_t value_sf = gte_shift(value, sf);
-    int32_t value_12 = gte_shift(value, 1);
-    constexpr int32_t max = 0x1000;
-    constexpr int32_t min = 0x0000;
-
-    if (value_sf < min || value_sf > max) FLAG |= (1 << 12);
-    return std::clamp<int32_t>(value_12, min, max);
-}
-
-void PCSX::GTE::RTPS(uint32_t op) {
-    GTE_LOG("%08x GTE: RTPS|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1(int44((int64_t)TRX << 12) + (R11 * VX0) + (R12 * VY0) + (R13 * VZ0));
-    MAC2 = A2(int44((int64_t)TRY << 12) + (R21 * VX0) + (R22 * VY0) + (R23 * VZ0));
-    MAC3 = A3(int44((int64_t)TRZ << 12) + (R31 * VX0) + (R32 * VY0) + (R33 * VZ0));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3_sf(s_mac3, s_sf, lm);
-    pushZ(Lm_D(s_mac3, 1));
-
-    const int32_t h_over_sz3 = gte_divide(H, SZ3);
-    SXY0 = SXY1;
-    SXY1 = SXY2;
-    SX2 =
-        Lm_G1(F((int64_t)OFX + ((int64_t)IR1 * h_over_sz3) * (PCSX::g_emulator->config().Widescreen ? 0.75 : 1)) >> 16);
-
-    SY2 = Lm_G2(F((int64_t)OFY + ((int64_t)IR2 * h_over_sz3)) >> 16);
-
-    PGXP_pushSXYZ2s(
-        Lm_G1_ia((int64_t)OFX + (int64_t)(IR1 * h_over_sz3) * (PCSX::g_emulator->config().Widescreen ? 0.75 : 1)),
-        Lm_G2_ia((int64_t)OFY + (int64_t)(IR2 * h_over_sz3)), std::max((int)SZ3, H / 2), SXY2);
-
-    // PGXP_RTPS(0, SXY2);
-
-    MAC0 = F((int64_t)DQB + ((int64_t)DQA * h_over_sz3));
-    IR0 = Lm_H(s_mac0, 1);
-}
-
-void PCSX::GTE::NCLIP(uint32_t op) {
-    GTE_LOG("%08x GTE: NCLIP|", op);
-    FLAG = 0;
-
-    if (PGXP_NLCIP_valid(SXY0, SXY1, SXY2))
-        MAC0 = F(PGXP_NCLIP());
-    else
-        MAC0 = F((int64_t)(SX0 * SY1) + (SX1 * SY2) + (SX2 * SY0) - (SX0 * SY2) - (SX1 * SY0) - (SX2 * SY1));
-}
-
-void PCSX::GTE::OP(uint32_t op) {
-    GTE_LOG("%08x GTE: OP|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1((int64_t)(R22 * IR3) - (R33 * IR2));
-    MAC2 = A2((int64_t)(R33 * IR1) - (R11 * IR3));
-    MAC3 = A3((int64_t)(R11 * IR2) - (R22 * IR1));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-}
-
-void PCSX::GTE::DPCS(uint32_t op) {
-    GTE_LOG("%08x GTE: DPCS|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1((R << 16) + (IR0 * Lm_B1(A1(((int64_t)RFC << 12) - (R << 16)), 0)));
-    MAC2 = A2((G << 16) + (IR0 * Lm_B2(A2(((int64_t)GFC << 12) - (G << 16)), 0)));
-    MAC3 = A3((B << 16) + (IR0 * Lm_B3(A3(((int64_t)BFC << 12) - (B << 16)), 0)));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::INTPL(uint32_t op) {
-    GTE_LOG("%08x GTE: INTPL|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1((IR1 << 12) + (IR0 * Lm_B1(A1(((int64_t)RFC << 12) - (IR1 << 12)), 0)));
-    MAC2 = A2((IR2 << 12) + (IR0 * Lm_B2(A2(((int64_t)GFC << 12) - (IR2 << 12)), 0)));
-    MAC3 = A3((IR3 << 12) + (IR0 * Lm_B3(A3(((int64_t)BFC << 12) - (IR3 << 12)), 0)));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::MVMVA(uint32_t op) {
-    GTE_LOG("%08x GTE: MVMVA|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    const int mx = GTE_MX(gteop(op));
-    const int v = GTE_V(gteop(op));
-    const int cv = GTE_CV(gteop(op));
-
-    switch (cv) {
-        case 2:
-            MAC1 = A1((int64_t)(MX12(mx) * VY(v)) + (MX13(mx) * VZ(v)));
-            MAC2 = A2((int64_t)(MX22(mx) * VY(v)) + (MX23(mx) * VZ(v)));
-            MAC3 = A3((int64_t)(MX32(mx) * VY(v)) + (MX33(mx) * VZ(v)));
-            Lm_B1(A1(((int64_t)CV1(cv) << 12) + (MX11(mx) * VX(v))), 0);
-            Lm_B2(A2(((int64_t)CV2(cv) << 12) + (MX21(mx) * VX(v))), 0);
-            Lm_B3(A3(((int64_t)CV3(cv) << 12) + (MX31(mx) * VX(v))), 0);
-            break;
-
-        default:
-            MAC1 = A1(int44((int64_t)CV1(cv) << 12) + (MX11(mx) * VX(v)) + (MX12(mx) * VY(v)) + (MX13(mx) * VZ(v)));
-            MAC2 = A2(int44((int64_t)CV2(cv) << 12) + (MX21(mx) * VX(v)) + (MX22(mx) * VY(v)) + (MX23(mx) * VZ(v)));
-            MAC3 = A3(int44((int64_t)CV3(cv) << 12) + (MX31(mx) * VX(v)) + (MX32(mx) * VY(v)) + (MX33(mx) * VZ(v)));
-            break;
-    }
-
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-}
-
-void PCSX::GTE::NCDS(uint32_t op) {
-    GTE_LOG("%08x GTE: NCDS|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1((int64_t)(L11 * VX0) + (L12 * VY0) + (L13 * VZ0));
-    MAC2 = A2((int64_t)(L21 * VX0) + (L22 * VY0) + (L23 * VZ0));
-    MAC3 = A3((int64_t)(L31 * VX0) + (L32 * VY0) + (L33 * VZ0));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    MAC1 = A1(int44((int64_t)RBK << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-    MAC2 = A2(int44((int64_t)GBK << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-    MAC3 = A3(int44((int64_t)BBK << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    MAC1 = A1(((R << 4) * IR1) + (IR0 * Lm_B1(A1(((int64_t)RFC << 12) - ((R << 4) * IR1)), 0)));
-    MAC2 = A2(((G << 4) * IR2) + (IR0 * Lm_B2(A2(((int64_t)GFC << 12) - ((G << 4) * IR2)), 0)));
-    MAC3 = A3(((B << 4) * IR3) + (IR0 * Lm_B3(A3(((int64_t)BFC << 12) - ((B << 4) * IR3)), 0)));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::CDP(uint32_t op) {
-    GTE_LOG("%08x GTE: CDP|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1(int44((int64_t)RBK << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-    MAC2 = A2(int44((int64_t)GBK << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-    MAC3 = A3(int44((int64_t)BBK << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    MAC1 = A1(((R << 4) * IR1) + (IR0 * Lm_B1(A1(((int64_t)RFC << 12) - ((R << 4) * IR1)), 0)));
-    MAC2 = A2(((G << 4) * IR2) + (IR0 * Lm_B2(A2(((int64_t)GFC << 12) - ((G << 4) * IR2)), 0)));
-    MAC3 = A3(((B << 4) * IR3) + (IR0 * Lm_B3(A3(((int64_t)BFC << 12) - ((B << 4) * IR3)), 0)));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::NCDT(uint32_t op) {
-    GTE_LOG("%08x GTE: NCDT|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    for (int v = 0; v < 3; v++) {
-        MAC1 = A1((int64_t)(L11 * VX(v)) + (L12 * VY(v)) + (L13 * VZ(v)));
-        MAC2 = A2((int64_t)(L21 * VX(v)) + (L22 * VY(v)) + (L23 * VZ(v)));
-        MAC3 = A3((int64_t)(L31 * VX(v)) + (L32 * VY(v)) + (L33 * VZ(v)));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        MAC1 = A1(int44((int64_t)RBK << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-        MAC2 = A2(int44((int64_t)GBK << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-        MAC3 = A3(int44((int64_t)BBK << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        MAC1 = A1(((R << 4) * IR1) + (IR0 * Lm_B1(A1(((int64_t)RFC << 12) - ((R << 4) * IR1)), 0)));
-        MAC2 = A2(((G << 4) * IR2) + (IR0 * Lm_B2(A2(((int64_t)GFC << 12) - ((G << 4) * IR2)), 0)));
-        MAC3 = A3(((B << 4) * IR3) + (IR0 * Lm_B3(A3(((int64_t)BFC << 12) - ((B << 4) * IR3)), 0)));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        RGB0 = RGB1;
-        RGB1 = RGB2;
-        CD2 = CODE;
-        R2 = Lm_C1(MAC1 >> 4);
-        G2 = Lm_C2(MAC2 >> 4);
-        B2 = Lm_C3(MAC3 >> 4);
-    }
-}
-
-void PCSX::GTE::NCCS(uint32_t op) {
-    GTE_LOG("%08x GTE: NCCS|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1((int64_t)(L11 * VX0) + (L12 * VY0) + (L13 * VZ0));
-    MAC2 = A2((int64_t)(L21 * VX0) + (L22 * VY0) + (L23 * VZ0));
-    MAC3 = A3((int64_t)(L31 * VX0) + (L32 * VY0) + (L33 * VZ0));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    MAC1 = A1(int44((int64_t)RBK << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-    MAC2 = A2(int44((int64_t)GBK << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-    MAC3 = A3(int44((int64_t)BBK << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    MAC1 = A1((R << 4) * IR1);
-    MAC2 = A2((G << 4) * IR2);
-    MAC3 = A3((B << 4) * IR3);
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::CC(uint32_t op) {
-    GTE_LOG("%08x GTE: CC|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    GTE_LOG("%08x GTE: CC|", op);
-    MAC1 = A1(int44(((int64_t)RBK) << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-    MAC2 = A2(int44(((int64_t)GBK) << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-    MAC3 = A3(int44(((int64_t)BBK) << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    MAC1 = A1((R << 4) * IR1);
-    MAC2 = A2((G << 4) * IR2);
-    MAC3 = A3((B << 4) * IR3);
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::NCS(uint32_t op) {
-    GTE_LOG("%08x GTE: NCS|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1((int64_t)(L11 * VX0) + (L12 * VY0) + (L13 * VZ0));
-    MAC2 = A2((int64_t)(L21 * VX0) + (L22 * VY0) + (L23 * VZ0));
-    MAC3 = A3((int64_t)(L31 * VX0) + (L32 * VY0) + (L33 * VZ0));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    MAC1 = A1(int44((int64_t)RBK << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-    MAC2 = A2(int44((int64_t)GBK << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-    MAC3 = A3(int44((int64_t)BBK << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::NCT(uint32_t op) {
-    GTE_LOG("%08x GTE: NCT|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    for (int v = 0; v < 3; v++) {
-        MAC1 = A1((int64_t)(L11 * VX(v)) + (L12 * VY(v)) + (L13 * VZ(v)));
-        MAC2 = A2((int64_t)(L21 * VX(v)) + (L22 * VY(v)) + (L23 * VZ(v)));
-        MAC3 = A3((int64_t)(L31 * VX(v)) + (L32 * VY(v)) + (L33 * VZ(v)));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        MAC1 = A1(int44((int64_t)RBK << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-        MAC2 = A2(int44((int64_t)GBK << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-        MAC3 = A3(int44((int64_t)BBK << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        RGB0 = RGB1;
-        RGB1 = RGB2;
-        CD2 = CODE;
-        R2 = Lm_C1(MAC1 >> 4);
-        G2 = Lm_C2(MAC2 >> 4);
-        B2 = Lm_C3(MAC3 >> 4);
-    }
-}
-
-void PCSX::GTE::SQR(uint32_t op) {
-    GTE_LOG("%08x GTE: SQR|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1(IR1 * IR1);
-    MAC2 = A2(IR2 * IR2);
-    MAC3 = A3(IR3 * IR3);
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-}
-
-void PCSX::GTE::DCPL(uint32_t op) {
-    GTE_LOG("%08x GTE: DCPL|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1(((R << 4) * IR1) + (IR0 * Lm_B1(A1(((int64_t)RFC << 12) - ((R << 4) * IR1)), 0)));
-    MAC2 = A2(((G << 4) * IR2) + (IR0 * Lm_B2(A2(((int64_t)GFC << 12) - ((G << 4) * IR2)), 0)));
-    MAC3 = A3(((B << 4) * IR3) + (IR0 * Lm_B3(A3(((int64_t)BFC << 12) - ((B << 4) * IR3)), 0)));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::DPCT(uint32_t op) {
-    GTE_LOG("%08x GTE: DPCT|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    for (int v = 0; v < 3; v++) {
-        MAC1 = A1((R0 << 16) + (IR0 * Lm_B1(A1(((int64_t)RFC << 12) - (R0 << 16)), 0)));
-        MAC2 = A2((G0 << 16) + (IR0 * Lm_B2(A2(((int64_t)GFC << 12) - (G0 << 16)), 0)));
-        MAC3 = A3((B0 << 16) + (IR0 * Lm_B3(A3(((int64_t)BFC << 12) - (B0 << 16)), 0)));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        RGB0 = RGB1;
-        RGB1 = RGB2;
-        CD2 = CODE;
-        R2 = Lm_C1(MAC1 >> 4);
-        G2 = Lm_C2(MAC2 >> 4);
-        B2 = Lm_C3(MAC3 >> 4);
-    }
-}
-
-void PCSX::GTE::AVSZ3(uint32_t op) {
-    GTE_LOG("%08x GTE: AVSZ3|", op);
-    FLAG = 0;
-
-    MAC0 = F((int64_t)(ZSF3 * SZ1) + (ZSF3 * SZ2) + (ZSF3 * SZ3));
-    OTZ = Lm_D(s_mac0, 1);
-}
-
-void PCSX::GTE::AVSZ4(uint32_t op) {
-    GTE_LOG("%08x GTE: AVSZ4|", op);
-    FLAG = 0;
-
-    MAC0 = F((int64_t)(ZSF4 * SZ0) + (ZSF4 * SZ1) + (ZSF4 * SZ2) + (ZSF4 * SZ3));
-    OTZ = Lm_D(s_mac0, 1);
-}
-
-void PCSX::GTE::RTPT(uint32_t op) {
-    GTE_LOG("%08x GTE: RTPT|", op);
-
-    int32_t h_over_sz3;
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    for (int v = 0; v < 3; v++) {
-        MAC1 = A1(int44((int64_t)TRX << 12) + (R11 * VX(v)) + (R12 * VY(v)) + (R13 * VZ(v)));
-        MAC2 = A2(int44((int64_t)TRY << 12) + (R21 * VX(v)) + (R22 * VY(v)) + (R23 * VZ(v)));
-        MAC3 = A3(int44((int64_t)TRZ << 12) + (R31 * VX(v)) + (R32 * VY(v)) + (R33 * VZ(v)));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3_sf(s_mac3, s_sf, lm);
-        pushZ(Lm_D(s_mac3, 1));
-
-        h_over_sz3 = gte_divide(H, SZ3);
-        SXY0 = SXY1;
-        SXY1 = SXY2;
-        SX2 = Lm_G1(
-            F((int64_t)OFX + ((int64_t)IR1 * h_over_sz3) * (PCSX::g_emulator->config().Widescreen ? 0.75 : 1)) >> 16);
-        SY2 = Lm_G2(F((int64_t)OFY + ((int64_t)IR2 * h_over_sz3)) >> 16);
-
-        PGXP_pushSXYZ2s(
-            Lm_G1_ia((int64_t)OFX + (int64_t)(IR1 * h_over_sz3) * (PCSX::g_emulator->config().Widescreen ? 0.75 : 1)),
-            Lm_G2_ia((int64_t)OFY + (int64_t)(IR2 * h_over_sz3)), std::max((int)SZ3, H / 2), SXY2);
-
-        // PGXP_RTPS(v, SXY2);
-    }
-
-    MAC0 = F((int64_t)DQB + ((int64_t)DQA * h_over_sz3));
-    IR0 = Lm_H(s_mac0, 1);
-}
-
-void PCSX::GTE::GPL(uint32_t op) {
-    GTE_LOG("%08x GTE: GPL|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1(gte_shift_GPL(MAC1, s_sf) + (IR0 * IR1));
-    MAC2 = A2(gte_shift_GPL(MAC2, s_sf) + (IR0 * IR2));
-    MAC3 = A3(gte_shift_GPL(MAC3, s_sf) + (IR0 * IR3));
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::GPF(uint32_t op) {
-    GTE_LOG("%08x GTE: GPF|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    MAC1 = A1(IR0 * IR1);
-    MAC2 = A2(IR0 * IR2);
-    MAC3 = A3(IR0 * IR3);
-    IR1 = Lm_B1(MAC1, lm);
-    IR2 = Lm_B2(MAC2, lm);
-    IR3 = Lm_B3(MAC3, lm);
-    RGB0 = RGB1;
-    RGB1 = RGB2;
-    CD2 = CODE;
-    R2 = Lm_C1(MAC1 >> 4);
-    G2 = Lm_C2(MAC2 >> 4);
-    B2 = Lm_C3(MAC3 >> 4);
-}
-
-void PCSX::GTE::NCCT(uint32_t op) {
-    GTE_LOG("%08x GTE: NCCT|", op);
-
-    const int lm = GTE_LM(gteop(op));
-    s_sf = GTE_SF(gteop(op));
-    FLAG = 0;
-
-    for (int v = 0; v < 3; v++) {
-        MAC1 = A1((int64_t)(L11 * VX(v)) + (L12 * VY(v)) + (L13 * VZ(v)));
-        MAC2 = A2((int64_t)(L21 * VX(v)) + (L22 * VY(v)) + (L23 * VZ(v)));
-        MAC3 = A3((int64_t)(L31 * VX(v)) + (L32 * VY(v)) + (L33 * VZ(v)));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        MAC1 = A1(int44((int64_t)RBK << 12) + (LR1 * IR1) + (LR2 * IR2) + (LR3 * IR3));
-        MAC2 = A2(int44((int64_t)GBK << 12) + (LG1 * IR1) + (LG2 * IR2) + (LG3 * IR3));
-        MAC3 = A3(int44((int64_t)BBK << 12) + (LB1 * IR1) + (LB2 * IR2) + (LB3 * IR3));
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        MAC1 = A1((R << 4) * IR1);
-        MAC2 = A2((G << 4) * IR2);
-        MAC3 = A3((B << 4) * IR3);
-        IR1 = Lm_B1(MAC1, lm);
-        IR2 = Lm_B2(MAC2, lm);
-        IR3 = Lm_B3(MAC3, lm);
-        RGB0 = RGB1;
-        RGB1 = RGB2;
-        CD2 = CODE;
-        R2 = Lm_C1(MAC1 >> 4);
-        G2 = Lm_C2(MAC2 >> 4);
-        B2 = Lm_C3(MAC3 >> 4);
-    }
-}
diff --git a/src/core/gte.h b/src/core/gte.h
index 41f26707a..e5b6d1612 100644
--- a/src/core/gte.h
+++ b/src/core/gte.h
@@ -1,5 +1,5 @@
 /***************************************************************************
- *   Copyright (C) 2007 Ryan Schultz, PCSX-df Team, PCSX team              *
+ *   Copyright (C) 2026 PCSX-Redux authors                                *
  *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
@@ -18,64 +18,32 @@
  ***************************************************************************/
 
 #pragma once
+
 #include <bit>
+#include <cstdint>
 
 #include "core/psxemulator.h"
 #include "core/r3000a.h"
 
-// WTF termios
+// termios defines NCCS which collides with our method name
 #undef NCCS
 
-#define gteoB (PCSX::g_emulator->m_cpu->m_regs.GPR.r[_Rs_] + _Imm_)
-#define gteop(instruction) ((instruction) & 0x1ffffff)
-
 namespace PCSX {
 
 class GTE {
   public:
-    uint32_t MFC2(uint32_t code) {
-        // CPU[Rt] = GTE_D[Rd]
-        return MFC2_internal(_Rd_);
-    }
-
-    uint32_t MFC2(int reg) { return MFC2_internal(reg); }
-
-    uint32_t CFC2(uint32_t code) {
-        // CPU[Rt] = GTE_C[Rd]
-        return PCSX::g_emulator->m_cpu->m_regs.CP2C.p[_Rd_].d;
-    }
-
-    void CTC2(uint32_t value, int reg) { CTC2_internal(value, reg); }
-
-    void MTC2(uint32_t value, int reg) { MTC2_internal(value, reg); }
-
-    void MTC2(uint32_t code) { MTC2_internal(PCSX::g_emulator->m_cpu->m_regs.GPR.r[_Rt_], _Rd_); }
-    void CTC2(uint32_t code) { CTC2_internal(PCSX::g_emulator->m_cpu->m_regs.GPR.r[_Rt_], _Rd_); }
-    void LWC2(uint32_t code) {
-        if (gteoB & 3) {
-            PCSX::g_emulator->m_cpu->m_regs.pc -= 4;
-            PCSX::g_system->log(PCSX::LogClass::CPU, _("Unaligned address 0x%08x in LWC2 from 0x%08x\n"), gteoB,
-                                PCSX::g_emulator->m_cpu->m_regs.pc);
-            PCSX::g_emulator->m_cpu->m_regs.CP0.n.BadVAddr = gteoB;
-            PCSX::g_emulator->m_cpu->exception(PCSX::R3000Acpu::Exception::LoadAddressError,
-                                               PCSX::g_emulator->m_cpu->m_inDelaySlot);
-            return;
-        }
-        MTC2_internal(PCSX::g_emulator->m_mem->read32(gteoB), _Rt_);
-    }
-    void SWC2(uint32_t code) {
-        if (gteoB & 3) {
-            PCSX::g_emulator->m_cpu->m_regs.pc -= 4;
-            PCSX::g_system->log(PCSX::LogClass::CPU, _("Unaligned address 0x%08x in SWC2 from 0x%08x\n"), gteoB,
-                                PCSX::g_emulator->m_cpu->m_regs.pc);
-            PCSX::g_emulator->m_cpu->m_regs.CP0.n.BadVAddr = gteoB;
-            PCSX::g_emulator->m_cpu->exception(PCSX::R3000Acpu::Exception::StoreAddressError,
-                                               PCSX::g_emulator->m_cpu->m_inDelaySlot);
-            return;
-        }
-        PCSX::g_emulator->m_mem->write32(gteoB, MFC2_internal(_Rt_));
-    }
-
+    // COP2 data transfer operations
+    uint32_t MFC2(uint32_t code);
+    uint32_t MFC2(int reg);
+    uint32_t CFC2(uint32_t code);
+    void MTC2(uint32_t value, int reg);
+    void MTC2(uint32_t code);
+    void CTC2(uint32_t value, int reg);
+    void CTC2(uint32_t code);
+    void LWC2(uint32_t code);
+    void SWC2(uint32_t code);
+
+    // GTE function instructions (COP2 imm25)
     void RTPS(uint32_t code);
     void NCLIP(uint32_t code);
     void OP(uint32_t code);
@@ -99,61 +67,31 @@ class GTE {
     void GPL(uint32_t code);
     void NCCT(uint32_t code);
 
-    // If MSB is set, return the number of leading ones, else return the number of leading zeroes
-    // For an input of 0, 32 is returned
+    // Count the leading bits equal to the sign bit (MSB clear: leading zeros; MSB set: leading ones).
+    // The sign bit itself is counted, so the result is 1..32; 0 and 0xffffffff both return 32.
     static uint32_t countLeadingBits(uint32_t value) {
-        if (value & 0x80000000) {
-            value = ~value;
-        }
+        if (value & 0x80000000) value = ~value;
         return std::countl_zero<uint32_t>(value);
     }
 
-    // Count leading zeroes of a 16-bit value. For an input of 0, 16 is returned
+    // Count leading zeros of a 16-bit value. Returns 16 for input of 0.
     static uint32_t countLeadingZeros16(uint16_t value) {
-        // Use a 32-bit CLZ as it's what's most commonly available and Clang/GCC fail to optimize 16-bit CLZ
-        const auto count = std::countl_zero<uint32_t>((uint32_t)value);
-        return count - 16;
+        return std::countl_zero<uint32_t>(static_cast<uint32_t>(value)) - 16;
     }
 
   private:
-    class int44 {
-      public:
-        int44(int64_t value)
-            : m_value(value), m_positive_overflow(value > 0x7ffffffffff), m_negative_overflow(value < -0x80000000000) {}
-
-        int44(int64_t value, bool positive_overflow, bool negative_overflow)
-            : m_value(value), m_positive_overflow(positive_overflow), m_negative_overflow(negative_overflow) {}
-
-        int44 operator+(int64_t rhs) {
-            int64_t value = ((m_value + rhs) << 20) >> 20;
-            return int44(value, m_positive_overflow || (value < 0 && m_value >= 0 && rhs >= 0),
-                         m_negative_overflow || (value >= 0 && m_value < 0 && rhs < 0));
-        }
-
-        bool positiveOverflow() { return m_positive_overflow; }
-        bool negativeOverflow() { return m_negative_overflow; }
-        int64_t value() { return m_value; }
-
-      private:
-        int64_t m_value;
-        bool m_positive_overflow;
-        bool m_negative_overflow;
-    };
-
-    int s_sf;
-    int64_t s_mac0;
-    int64_t s_mac3;
-
-    int32_t BOUNDS(int44 value, int max_flag, int min_flag);
-    int32_t A1(int44 a);
-    int32_t A2(int44 a);
-    int32_t A3(int44 a);
-    int64_t F(int64_t a);
-
-    uint32_t MFC2_internal(int reg);
-    void MTC2_internal(uint32_t value, int reg);
-    void CTC2_internal(uint32_t value, int reg);
-    void pushZ(uint16_t z);
+    // Template instruction implementations, parameterized on sf (shift factor) and lm (limit mode).
+    // Defined in gte-instructions.cc. The public methods dispatch to these based on the encoding.
+    template <bool sf, bool lm> void op(uint32_t op);
+    template <bool sf, bool lm> void dpcs(uint32_t op);
+    template <bool sf, bool lm> void intpl(uint32_t op);
+    template <bool sf, bool lm> void cdp(uint32_t op);
+    template <bool sf, bool lm> void cc(uint32_t op);
+    template <bool sf, bool lm> void sqr(uint32_t op);
+    template <bool sf, bool lm> void dcpl(uint32_t op);
+    template <bool sf, bool lm> void dpct(uint32_t op);
+    template <bool sf, bool lm> void gpf(uint32_t op);
+    template <bool sf, bool lm> void gpl(uint32_t op);
 };
 
 }  // namespace PCSX
diff --git a/src/mips/common/hardware/cop2.h b/src/mips/common/hardware/cop2.h
new file mode 100644
index 000000000..e3f430068
--- /dev/null
+++ b/src/mips/common/hardware/cop2.h
@@ -0,0 +1,310 @@
+/*
+
+MIT License
+
+Copyright (c) 2026 PCSX-Redux authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#pragma once
+
+// COP2 (GTE) instruction encoder and register access helpers.
+//
+// GTE command encoding (25-bit immediate for cop2 instruction):
+//
+//   24-20   19    18-17  16-15  14-13  12-11  10    9-6    5-0
+//   [fake]  [sf]  [ mx ] [ v  ] [ cv ] [pad]  [lm]  [pad]  [cmd]
+//
+//   sf:  shift flag (0 = no shift, 1 = shift right 12)
+//   mx:  matrix select (0=RT, 1=LL, 2=LC, 3=garbage)
+//   v:   vector select (0=V0, 1=V1, 2=V2, 3=IR)
+//   cv:  control vector select (0=TR, 1=BK, 2=FC/bugged, 3=zero)
+//   lm:  limit flag (0=clamp -0x8000..0x7fff, 1=clamp 0..0x7fff)
+//   cmd: function code (6 bits)
+//
+// The upper bits (20-24) contain a "fake" opcode number that Sony's
+// documentation uses for instruction naming. Hardware ignores these
+// bits for dispatch - only the 6-bit function code matters.
+
+#include <stdint.h>
+
+// ==========================================================================
+// Bitfield encoding
+// ==========================================================================
+
+#define COP2_SF_SHIFT  19
+#define COP2_MX_SHIFT  17
+#define COP2_V_SHIFT   15
+#define COP2_CV_SHIFT  13
+#define COP2_LM_SHIFT  10
+
+// Shift factor
+#define COP2_SF0  0  // No shift
+#define COP2_SF1  1  // Shift right 12
+
+// Matrix select
+#define COP2_MX_RT  0  // Rotation matrix
+#define COP2_MX_LL  1  // Light matrix
+#define COP2_MX_LC  2  // Light color matrix
+#define COP2_MX_BAD 3  // Garbage matrix (undocumented)
+
+// Vector select
+#define COP2_V_V0  0
+#define COP2_V_V1  1
+#define COP2_V_V2  2
+#define COP2_V_IR  3  // IR1/IR2/IR3
+
+// Control vector select
+#define COP2_CV_TR   0  // Translation vector
+#define COP2_CV_BK   1  // Background color
+#define COP2_CV_FC   2  // Far color (bugged)
+#define COP2_CV_NONE 3  // Zero / no translation
+
+// Limit mode
+#define COP2_LM_SIGNED   0  // Clamp IR to [-0x8000, 0x7FFF]
+#define COP2_LM_UNSIGNED 1  // Clamp IR to [0, 0x7FFF]
+
+// Function codes (bits 5-0)
+#define COP2_FN_RTPS   0x01
+#define COP2_FN_NCLIP  0x06
+#define COP2_FN_OP     0x0c
+#define COP2_FN_DPCS   0x10
+#define COP2_FN_INTPL  0x11
+#define COP2_FN_MVMVA  0x12
+#define COP2_FN_NCDS   0x13
+#define COP2_FN_CDP    0x14
+#define COP2_FN_NCDT   0x16
+#define COP2_FN_NCCS   0x1b
+#define COP2_FN_CC     0x1c
+#define COP2_FN_NCS    0x1e
+#define COP2_FN_NCT    0x20
+#define COP2_FN_SQR    0x28
+#define COP2_FN_DCPL   0x29
+#define COP2_FN_DPCT   0x2a
+#define COP2_FN_AVSZ3  0x2d
+#define COP2_FN_AVSZ4  0x2e
+#define COP2_FN_RTPT   0x30
+#define COP2_FN_GPF    0x3d
+#define COP2_FN_GPL    0x3e
+#define COP2_FN_NCCT   0x3f
+
+// ==========================================================================
+// Generic encoder: build a cop2 opcode from individual fields
+// ==========================================================================
+
+// Each field is shifted to its bit position and the results are OR-ed together.
+// The fake field (bits 24-20) is Sony's instruction number. Hardware
+// ignores it, but conventional encodings include it.
+#define COP2_OP(fake, sf, mx, v, cv, lm, fn) \
+    (((fake) << 20) | ((sf) << COP2_SF_SHIFT) | ((mx) << COP2_MX_SHIFT) | \
+     ((v) << COP2_V_SHIFT) | ((cv) << COP2_CV_SHIFT) | \
+     ((lm) << COP2_LM_SHIFT) | (fn))
+
+// ==========================================================================
+// Named instruction encoders
+// ==========================================================================
+// Each macro embeds the conventional fake field value from Sony's docs.
+// The sf and lm parameters are user-selectable. Other fields (mx, v, cv)
+// are fixed per instruction - only MVMVA exposes them.
+
+// Perspective transform (single / triple)
+#define COP2_RTPS(sf, lm)   COP2_OP( 1, sf, 0, 0, 0, lm, COP2_FN_RTPS)
+#define COP2_RTPT(sf, lm)   COP2_OP( 2, sf, 0, 0, 0, lm, COP2_FN_RTPT)
+
+// Normal clipping
+#define COP2_NCLIP           COP2_OP(20, 0, 0, 0, 0, 0, COP2_FN_NCLIP)
+
+// Cross product (rotation diagonal x IR)
+#define COP2_OP_CP(sf, lm)  COP2_OP(23, sf, 0, 0, 0, lm, COP2_FN_OP)
+
+// Depth cue
+#define COP2_DPCS(sf, lm)   COP2_OP( 7, sf, 0, 0, 0, lm, COP2_FN_DPCS)
+#define COP2_DPCT(sf, lm)   COP2_OP(15, sf, 0, 0, 0, lm, COP2_FN_DPCT)
+#define COP2_DCPL(sf, lm)   COP2_OP( 6, sf, 0, 0, 0, lm, COP2_FN_DCPL)
+#define COP2_INTPL(sf, lm)  COP2_OP( 9, sf, 0, 0, 0, lm, COP2_FN_INTPL)
+
+// Matrix-vector multiply and add (fully parameterized)
+#define COP2_MVMVA(sf, mx, v, cv, lm) \
+    COP2_OP(4, sf, mx, v, cv, lm, COP2_FN_MVMVA)
+
+// Lighting: normal color (single / triple)
+#define COP2_NCS(sf, lm)    COP2_OP(12, sf, 0, 0, 0, lm, COP2_FN_NCS)
+#define COP2_NCT(sf, lm)    COP2_OP(13, sf, 0, 0, 0, lm, COP2_FN_NCT)
+#define COP2_NCCS(sf, lm)   COP2_OP(16, sf, 0, 0, 0, lm, COP2_FN_NCCS)
+#define COP2_NCCT(sf, lm)   COP2_OP(17, sf, 0, 0, 0, lm, COP2_FN_NCCT)
+#define COP2_NCDS(sf, lm)   COP2_OP(14, sf, 0, 0, 0, lm, COP2_FN_NCDS)
+#define COP2_NCDT(sf, lm)   COP2_OP(15, sf, 0, 0, 0, lm, COP2_FN_NCDT)
+
+// Color
+#define COP2_CC(sf, lm)     COP2_OP(19, sf, 0, 0, 0, lm, COP2_FN_CC)
+#define COP2_CDP(sf, lm)    COP2_OP(18, sf, 0, 0, 0, lm, COP2_FN_CDP)
+
+// Square
+#define COP2_SQR(sf, lm)    COP2_OP(10, sf, 0, 0, 0, lm, COP2_FN_SQR)
+
+// Average Z
+#define COP2_AVSZ3           COP2_OP(21, 1, 0, 0, 0, 0, COP2_FN_AVSZ3)
+#define COP2_AVSZ4           COP2_OP(22, 1, 0, 0, 0, 0, COP2_FN_AVSZ4)
+
+// General purpose interpolation
+#define COP2_GPF(sf, lm)    COP2_OP(25, sf, 0, 0, 0, lm, COP2_FN_GPF)
+#define COP2_GPL(sf, lm)    COP2_OP(26, sf, 0, 0, 0, lm, COP2_FN_GPL)
+
+// ==========================================================================
+// Execution macro
+// ==========================================================================
+
+#define cop2_cmd(op) __asm__ volatile("cop2 %0" : : "i"(op))
+
+// ==========================================================================
+// Register access
+// ==========================================================================
+
+// GTE data registers (MTC2/MFC2, $0-$31)
+#define cop2_put(reg, val) do {             \
+    uint32_t _v = (val);                    \
+    __asm__ volatile("mtc2 %0, $" #reg      \
+                     "\n\tnop\n\tnop"        \
+                     : : "r"(_v));          \
+} while (0)
+
+#define cop2_get(reg, dest) do {            \
+    __asm__ volatile("mfc2 %0, $" #reg      \
+                     "\n\tnop\n\tnop"        \
+                     : "=r"(dest));          \
+} while (0)
+
+// GTE control registers (CTC2/CFC2, $0-$31)
+#define cop2_putc(reg, val) do {            \
+    uint32_t _v = (val);                    \
+    __asm__ volatile("ctc2 %0, $" #reg      \
+                     "\n\tnop\n\tnop"        \
+                     : : "r"(_v));          \
+} while (0)
+
+#define cop2_getc(reg, dest) do {           \
+    __asm__ volatile("cfc2 %0, $" #reg      \
+                     "\n\tnop\n\tnop"        \
+                     : "=r"(dest));          \
+} while (0)
+
+// ==========================================================================
+// Data register indices
+// ==========================================================================
+
+#define COP2_VXY0   0   // VX0 (low16), VY0 (high16)
+#define COP2_VZ0    1
+#define COP2_VXY1   2
+#define COP2_VZ1    3
+#define COP2_VXY2   4
+#define COP2_VZ2    5
+#define COP2_RGBC   6   // R (low8), G, B, CODE (high8)
+#define COP2_OTZ    7   // 16-bit unsigned, zero-extended on read
+#define COP2_IR0    8   // 16-bit signed, sign-extended on read
+#define COP2_IR1    9
+#define COP2_IR2   10
+#define COP2_IR3   11
+#define COP2_SXY0  12
+#define COP2_SXY1  13
+#define COP2_SXY2  14
+#define COP2_SXYP  15  // Write pushes SXY FIFO, read returns SXY2
+#define COP2_SZ0   16  // 16-bit unsigned, zero-extended on read
+#define COP2_SZ1   17
+#define COP2_SZ2   18
+#define COP2_SZ3   19
+#define COP2_RGB0  20  // Color FIFO entry 0 (oldest)
+#define COP2_RGB1  21
+#define COP2_RGB2  22  // Color FIFO entry 2 (newest, written by instructions)
+#define COP2_RES1  23  // Reserved (but read/write works)
+#define COP2_MAC0  24  // 32-bit signed
+#define COP2_MAC1  25
+#define COP2_MAC2  26
+#define COP2_MAC3  27
+#define COP2_IRGB  28  // Write expands 5-bit fields to IR1-3. Read packs IR1-3.
+#define COP2_ORGB  29  // Read-only: packs IR1-3 with saturation
+#define COP2_LZCS  30  // Write triggers LZCR computation
+#define COP2_LZCR  31  // Read-only: leading bit count result
+
+// ==========================================================================
+// Control register indices
+// ==========================================================================
+
+#define COP2_R11R12   0
+#define COP2_R13R21   1
+#define COP2_R22R23   2
+#define COP2_R31R32   3
+#define COP2_R33      4   // 16-bit, sign-extended on read/write
+#define COP2_TRX      5   // 32-bit
+#define COP2_TRY      6
+#define COP2_TRZ      7
+#define COP2_L11L12   8
+#define COP2_L13L21   9
+#define COP2_L22L23  10
+#define COP2_L31L32  11
+#define COP2_L33     12   // 16-bit, sign-extended
+#define COP2_RBK     13   // 32-bit
+#define COP2_GBK     14
+#define COP2_BBK     15
+#define COP2_LR1LR2  16
+#define COP2_LR3LG1  17
+#define COP2_LG2LG3  18
+#define COP2_LB1LB2  19
+#define COP2_LB3     20   // 16-bit, sign-extended
+#define COP2_RFC     21   // 32-bit
+#define COP2_GFC     22
+#define COP2_BFC     23
+#define COP2_OFX     24   // 32-bit (16.16 fixed)
+#define COP2_OFY     25
+#define COP2_H       26   // 16-bit unsigned (but sign-extends on CFC2 read)
+#define COP2_DQA     27   // 16-bit, sign-extended
+#define COP2_DQB     28   // 32-bit
+#define COP2_ZSF3    29   // 16-bit, sign-extended
+#define COP2_ZSF4    30   // 16-bit, sign-extended
+#define COP2_FLAG    31   // FLAG register (write mask 0x7FFFF000, bit 31 recomputed)
+
+// ==========================================================================
+// FLAG register bit definitions
+// ==========================================================================
+
+#define COP2_FLAG_MAC1_OVER_POS  (1u << 30)  // MAC1 result > +0x7FFFFFFFFFF
+#define COP2_FLAG_MAC2_OVER_POS  (1u << 29)
+#define COP2_FLAG_MAC3_OVER_POS  (1u << 28)
+#define COP2_FLAG_MAC1_OVER_NEG  (1u << 27)  // MAC1 result < -0x80000000000
+#define COP2_FLAG_MAC2_OVER_NEG  (1u << 26)
+#define COP2_FLAG_MAC3_OVER_NEG  (1u << 25)
+#define COP2_FLAG_IR1_SAT        (1u << 24)  // IR1 saturated (sets summary)
+#define COP2_FLAG_IR2_SAT        (1u << 23)  // IR2 saturated (sets summary)
+#define COP2_FLAG_IR3_SAT        (1u << 22)  // IR3 saturated (NO summary)
+#define COP2_FLAG_COLOR_R_SAT    (1u << 21)  // Color R saturated to [0,255] (NO summary)
+#define COP2_FLAG_COLOR_G_SAT    (1u << 20)  // Color G saturated (NO summary)
+#define COP2_FLAG_COLOR_B_SAT    (1u << 19)  // Color B saturated (NO summary)
+#define COP2_FLAG_SZ3_OTZ_SAT   (1u << 18)  // SZ3/OTZ saturated to [0,0xFFFF] (sets summary)
+#define COP2_FLAG_DIV_OVERFLOW   (1u << 17)  // Division overflow H >= 2*SZ3 (sets summary)
+#define COP2_FLAG_MAC0_OVER_POS  (1u << 16)  // MAC0 > 0x7FFFFFFF (sets summary)
+#define COP2_FLAG_MAC0_OVER_NEG  (1u << 15)  // MAC0 < -0x80000000 (sets summary)
+#define COP2_FLAG_SX2_SAT        (1u << 14)  // SX2 saturated to [-0x400,0x3FF] (sets summary)
+#define COP2_FLAG_SY2_SAT        (1u << 13)  // SY2 saturated (sets summary)
+#define COP2_FLAG_IR0_SAT        (1u << 12)  // IR0 saturated to [0,0x1000] (NO summary)
+#define COP2_FLAG_ERROR          (1u << 31)  // Error summary (OR of bits that set summary)
+
+// Bits that set the error summary (bit 31):
+// 30-23 (MAC overflow, IR1/IR2 sat) and 18-13 (SZ3, div, MAC0, SX2, SY2)
+// Bits that do NOT set summary: 22 (IR3), 21-19 (color RGB), 12 (IR0)
diff --git a/src/mips/tests/Makefile b/src/mips/tests/Makefile
index 83fd8656a..c62ecf502 100644
--- a/src/mips/tests/Makefile
+++ b/src/mips/tests/Makefile
@@ -3,6 +3,7 @@ all:
 	$(MAKE) -C cpu all
 	$(MAKE) -C cop0 all
 	$(MAKE) -C dma all
+	$(MAKE) -C gte all
 	$(MAKE) -C libc all
 	$(MAKE) -C memcpy all
 	$(MAKE) -C memset all
@@ -14,6 +15,7 @@ clean:
 	$(MAKE) -C cpu clean
 	$(MAKE) -C cop0 clean
 	$(MAKE) -C dma clean
+	$(MAKE) -C gte clean
 	$(MAKE) -C libc clean
 	$(MAKE) -C memcpy clean
 	$(MAKE) -C memset clean
diff --git a/src/mips/tests/gte/Makefile b/src/mips/tests/gte/Makefile
new file mode 100644
index 000000000..79c970cf1
--- /dev/null
+++ b/src/mips/tests/gte/Makefile
@@ -0,0 +1,50 @@
+TARGET = gte
+USE_FUNCTION_SECTIONS = false
+TYPE = ps-exe
+
+SRCS = \
+../uC-sdk-glue/BoardConsole.c \
+../uC-sdk-glue/BoardInit.c \
+../uC-sdk-glue/init.c \
+\
+../../../../third_party/uC-sdk/libc/src/cxx-glue.c \
+../../../../third_party/uC-sdk/libc/src/errno.c \
+../../../../third_party/uC-sdk/libc/src/initfini.c \
+../../../../third_party/uC-sdk/libc/src/malloc.c \
+../../../../third_party/uC-sdk/libc/src/qsort.c \
+../../../../third_party/uC-sdk/libc/src/rand.c \
+../../../../third_party/uC-sdk/libc/src/reent.c \
+../../../../third_party/uC-sdk/libc/src/stdio.c \
+../../../../third_party/uC-sdk/libc/src/string.c \
+../../../../third_party/uC-sdk/libc/src/strto.c \
+../../../../third_party/uC-sdk/libc/src/unistd.c \
+../../../../third_party/uC-sdk/libc/src/xprintf.c \
+../../../../third_party/uC-sdk/libc/src/xscanf.c \
+../../../../third_party/uC-sdk/libc/src/yscanf.c \
+../../../../third_party/uC-sdk/os/src/devfs.c \
+../../../../third_party/uC-sdk/os/src/filesystem.c \
+../../../../third_party/uC-sdk/os/src/fio.c \
+../../../../third_party/uC-sdk/os/src/hash-djb2.c \
+../../../../third_party/uC-sdk/os/src/init.c \
+../../../../third_party/uC-sdk/os/src/osdebug.c \
+../../../../third_party/uC-sdk/os/src/romfs.c \
+../../../../third_party/uC-sdk/os/src/sbrk.c \
+
+
+CPPFLAGS = -DNOFLOATINGPOINT
+CPPFLAGS += -I.
+CPPFLAGS += -I../../../../third_party/uC-sdk/libc/include
+CPPFLAGS += -I../../../../third_party/uC-sdk/os/include
+CPPFLAGS += -I../../../../third_party/libcester/include
+CPPFLAGS += -I../../openbios/uC-sdk-glue
+
+ifeq ($(PCSX_TESTS),true)
+CPPFLAGS += -DPCSX_TESTS=1
+endif
+
+SRCS += \
+../../common/syscalls/printf.s \
+../../common/crt0/uC-sdk-crt0.s \
+gte.c \
+
+include ../../common.mk
diff --git a/src/mips/tests/gte/gte-avsz.c b/src/mips/tests/gte/gte-avsz.c
new file mode 100644
index 000000000..269517c14
--- /dev/null
+++ b/src/mips/tests/gte/gte-avsz.c
@@ -0,0 +1,82 @@
+// AVSZ3 / AVSZ4: Average Z value computation
+
+CESTER_TEST(avsz3_basic, gte_tests,
+    cop2_put(17, 100);
+    cop2_put(18, 200);
+    cop2_put(19, 300);
+    cop2_putc(29, 0x555);  // ZSF3 ~ 4096/3
+    gte_clear_flag();
+    cop2_cmd(COP2_AVSZ3);
+    int32_t mac0;
+    uint32_t otz;
+    cop2_get(24, mac0);
+    cop2_get(7, otz);
+    cester_assert_int_eq(819000, mac0);
+    cester_assert_uint_eq(199, otz);
+)
+
+CESTER_TEST(avsz4_basic, gte_tests,
+    cop2_put(16, 100);
+    cop2_put(17, 200);
+    cop2_put(18, 300);
+    cop2_put(19, 400);
+    cop2_putc(30, 0x400);  // ZSF4 = 4096/4
+    gte_clear_flag();
+    cop2_cmd(COP2_AVSZ4);
+    int32_t mac0;
+    uint32_t otz;
+    cop2_get(24, mac0);
+    cop2_get(7, otz);
+    cester_assert_int_eq(1024000, mac0);
+    cester_assert_uint_eq(250, otz);
+)
+
+// Verify AVSZ3 uses SZ1+SZ2+SZ3, not SZ0+SZ1+SZ2
+CESTER_TEST(avsz3_uses_sz123, gte_tests,
+    cop2_put(16, 1000);   // SZ0 - should be ignored
+    cop2_put(17, 2000);   // SZ1
+    cop2_put(18, 3000);   // SZ2
+    cop2_put(19, 4000);   // SZ3
+    cop2_putc(29, 0x1000); // ZSF3 = 1.0 in 4.12
+    gte_clear_flag();
+    cop2_cmd(COP2_AVSZ3);
+    int32_t mac0;
+    cop2_get(24, mac0);
+    // SZ1+SZ2+SZ3 = 9000, * 4096 = 36864000
+    cester_assert_int_eq(36864000, mac0);
+)
+
+// OTZ saturation: result > 0xffff
+CESTER_TEST(avsz3_otz_saturate, gte_tests,
+    cop2_put(17, 0xffff);
+    cop2_put(18, 0xffff);
+    cop2_put(19, 0xffff);
+    cop2_putc(29, 0x1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_AVSZ3);
+    uint32_t otz, flag;
+    cop2_get(7, otz);
+    flag = gte_read_flag();
+    cester_assert_uint_eq(0xffff, otz);
+    // FLAG.18 (OTZ saturation) should be set
+    uint32_t flag18 = (flag >> 18) & 1;
+    cester_assert_uint_eq(1, flag18);
+)
+
+// Negative ZSF producing negative MAC0
+CESTER_TEST(avsz3_negative_zsf, gte_tests,
+    cop2_put(17, 100);
+    cop2_put(18, 200);
+    cop2_put(19, 300);
+    cop2_putc(29, 0xf000);  // ZSF3 = negative (sign-extended)
+    gte_clear_flag();
+    cop2_cmd(COP2_AVSZ3);
+    int32_t mac0;
+    uint32_t otz, flag;
+    cop2_get(24, mac0);
+    cop2_get(7, otz);
+    flag = gte_read_flag();
+    ramsyscall_printf("AVSZ3 neg ZSF: MAC0=%d OTZ=%u FLAG=0x%08x\n", mac0, otz, flag);
+    // Negative result should saturate OTZ to 0
+    cester_assert_uint_eq(0, otz);
+)
diff --git a/src/mips/tests/gte/gte-depthcue.c b/src/mips/tests/gte/gte-depthcue.c
new file mode 100644
index 000000000..86629ed5d
--- /dev/null
+++ b/src/mips/tests/gte/gte-depthcue.c
@@ -0,0 +1,215 @@
+// Depth cue instructions: DPCS, DPCT, DCPL, INTPL
+
+// DPCS: depth cue single - interpolates RGBC toward far color using IR0
+CESTER_TEST(dpcs_basic, gte_tests,
+    gte_set_far_color(0x1000, 0x1000, 0x1000);  // FC = (4096, 4096, 4096)
+    cop2_put(6, 0x00808080);  // RGBC: R=0x80, G=0x80, B=0x80
+    cop2_put(8, 0x0800);      // IR0 = 0.5
+    gte_clear_flag();
+    cop2_cmd(COP2_DPCS(1, 0));
+    int32_t mac1, mac2, mac3;
+    uint32_t rgb2;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    cop2_get(22, rgb2);
+    ramsyscall_printf("DPCS: MAC=(%d,%d,%d) RGB2=0x%08x\n", mac1, mac2, mac3, rgb2);
+    cester_assert_int_eq(3072, mac1);
+    cester_assert_int_eq(3072, mac2);
+    cester_assert_int_eq(3072, mac3);
+    cester_assert_uint_eq(0x00c0c0c0, rgb2);
+    // Formula (sf=1): MAC = ((R<<16) + IR0 * (((FC<<12) - (R<<16)) >> 12)) >> 12
+    // R<<16 = 0x80<<16 = 0x800000
+    // FC<<12 = 0x1000<<12 = 0x1000000
+    // diff = 0x1000000 - 0x800000 = 0x800000, >> 12 = 0x800
+    // MAC = (0x800000 + 0x800 * 0x800) >> 12 = 0xc00000 >> 12 = 3072; /16 = 0xc0
+)
+
+// DPCS with IR0=0: no interpolation, output = input color
+CESTER_TEST(dpcs_ir0_zero, gte_tests,
+    gte_set_far_color(0xff00, 0xff00, 0xff00);
+    cop2_put(6, 0x00406080);  // R=0x80, G=0x60, B=0x40
+    cop2_put(8, 0);           // IR0 = 0
+    gte_clear_flag();
+    cop2_cmd(COP2_DPCS(1, 0));
+    uint32_t rgb2;
+    cop2_get(22, rgb2);
+    uint8_t r = rgb2 & 0xff;
+    uint8_t g = (rgb2 >> 8) & 0xff;
+    uint8_t b = (rgb2 >> 16) & 0xff;
+    // With IR0=0, interpolation weight is 0, so output = input
+    cester_assert_uint_eq(0x80, r);
+    cester_assert_uint_eq(0x60, g);
+    cester_assert_uint_eq(0x40, b);
+)
+
+// DPCS with IR0=0x1000: full interpolation toward far color
+CESTER_TEST(dpcs_ir0_max, gte_tests,
+    gte_set_far_color(0x1000, 0x800, 0x400);  // FC scaled
+    cop2_put(6, 0x00000000);  // RGBC: all zero
+    cop2_put(8, 0x1000);      // IR0 = 1.0
+    gte_clear_flag();
+    cop2_cmd(COP2_DPCS(1, 0));
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    ramsyscall_printf("DPCS max: MAC=(%d,%d,%d)\n", mac1, mac2, mac3);
+    cester_assert_int_eq(4096, mac1);
+    cester_assert_int_eq(2048, mac2);
+    cester_assert_int_eq(1024, mac3);
+    // With R=0, MAC = 0 + IR0 * (FC<<12 - 0) = 1.0 * FC<<12 >> 12 = FC
+)
+
+// DPCS color FIFO push and CODE preservation
+CESTER_TEST(dpcs_code_preserved, gte_tests,
+    gte_set_far_color(0, 0, 0);
+    cop2_put(6, 0xab102030);  // CODE=0xAB, R=0x30, G=0x20, B=0x10
+    cop2_put(8, 0);
+    gte_clear_flag();
+    cop2_cmd(COP2_DPCS(1, 0));
+    uint32_t rgb2;
+    cop2_get(22, rgb2);
+    cester_assert_uint_eq(0xab, (rgb2 >> 24) & 0xff);  // CODE preserved
+)
+
+// DPCT: depth cue triple - reads from color FIFO front (RGB0), not RGBC
+CESTER_TEST(dpct_reads_fifo, gte_tests,
+    gte_set_far_color(0, 0, 0);
+    // Set up color FIFO with known values
+    cop2_put(20, 0x00102030);  // RGB0: R=0x30, G=0x20, B=0x10
+    cop2_put(21, 0x00405060);  // RGB1
+    cop2_put(22, 0x00708090);  // RGB2
+    cop2_put(6, 0xff000000);   // RGBC: CODE=0xff, colors=0 (should NOT be used as input)
+    cop2_put(8, 0);            // IR0=0: output = input
+    gte_clear_flag();
+    cop2_cmd(COP2_DPCT(1, 0));
+    // After 3 iterations, the FIFO has been processed
+    uint32_t rgb0, rgb1, rgb2;
+    cop2_get(20, rgb0);
+    cop2_get(21, rgb1);
+    cop2_get(22, rgb2);
+    ramsyscall_printf("DPCT: RGB0=0x%08x RGB1=0x%08x RGB2=0x%08x\n", rgb0, rgb1, rgb2);
+    // Each iteration: reads R0/G0/B0 (front of FIFO), pushes result
+    // With IR0=0, each iteration's output = its input color
+    // Iteration 1: reads RGB0(0x102030), pushes -> FIFO shifts
+    // Iteration 2: reads new RGB0 (was RGB1: 0x405060), pushes
+    // Iteration 3: reads new RGB0 (was RGB2: 0x708090), pushes
+    // Result FIFO should contain the 3 processed colors
+    // CODE comes from RGBC (0xff)
+    cester_assert_uint_eq(0xff102030, rgb0);
+    cester_assert_uint_eq(0xff405060, rgb1);
+    cester_assert_uint_eq(0xff708090, rgb2);
+)
+
+// DCPL: depth cue with pre-computed light
+CESTER_TEST(dcpl_basic, gte_tests,
+    gte_set_far_color(0x1000, 0x1000, 0x1000);
+    cop2_put(6, 0x00808080);  // RGBC
+    // Pre-computed light in IR1-3
+    cop2_put(9, 0x1000);   // IR1 = 1.0
+    cop2_put(10, 0x0800);  // IR2 = 0.5
+    cop2_put(11, 0x0400);  // IR3 = 0.25
+    cop2_put(8, 0);        // IR0 = 0 (no depth cue)
+    gte_clear_flag();
+    cop2_cmd(COP2_DCPL(1, 0));
+    int32_t mac1, mac2, mac3;
+    uint32_t rgb2;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    cop2_get(22, rgb2);
+    ramsyscall_printf("DCPL: MAC=(%d,%d,%d) RGB2=0x%08x\n", mac1, mac2, mac3, rgb2);
+    cester_assert_int_eq(2048, mac1);
+    cester_assert_int_eq(1024, mac2);
+    cester_assert_int_eq(512, mac3);
+    cester_assert_uint_eq(0x00204080, rgb2);
+    // With IR0=0: MAC = (R<<4)*IR, no depth cue interpolation
+    // MAC1 = (0x80 << 4) * 0x1000 = 0x800 * 0x1000 = 0x800000
+    // After >>12: 0x800 = 2048 -> IR1, /16 = 128 -> R2
+)
+
+// DCPL with depth cue interpolation
+CESTER_TEST(dcpl_with_depth, gte_tests,
+    gte_set_far_color(0x1000, 0x1000, 0x1000);
+    cop2_put(6, 0x00808080);
+    cop2_put(9, 0x1000);
+    cop2_put(10, 0x1000);
+    cop2_put(11, 0x1000);
+    cop2_put(8, 0x0800);  // IR0 = 0.5
+    gte_clear_flag();
+    cop2_cmd(COP2_DCPL(1, 0));
+    int32_t mac1, mac2, mac3;
+    uint32_t flag;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    flag = gte_read_flag();
+    ramsyscall_printf("DCPL depth: MAC=(%d,%d,%d) FLAG=0x%08x\n", mac1, mac2, mac3, flag);
+    cester_assert_int_eq(3072, mac1);
+    cester_assert_int_eq(3072, mac2);
+    cester_assert_int_eq(3072, mac3);
+    cester_assert_uint_eq(0x00000000, flag);
+)
+
+// INTPL: interpolation (depth cue on IR vector directly)
+CESTER_TEST(intpl_basic, gte_tests,
+    gte_set_far_color(0x1000, 0x2000, 0x3000);
+    cop2_put(9, 0x100);   // IR1
+    cop2_put(10, 0x200);  // IR2
+    cop2_put(11, 0x300);  // IR3
+    cop2_put(8, 0);       // IR0 = 0: no interpolation
+    gte_clear_flag();
+    cop2_cmd(COP2_INTPL(1, 0));
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // With IR0=0: MAC = IR << 12 >> shift = IR (with sf=1)
+    cester_assert_int_eq(0x100, mac1);
+    cester_assert_int_eq(0x200, mac2);
+    cester_assert_int_eq(0x300, mac3);
+)
+
+CESTER_TEST(intpl_half, gte_tests,
+    gte_set_far_color(0x1000, 0x1000, 0x1000);
+    cop2_put(9, 0);
+    cop2_put(10, 0);
+    cop2_put(11, 0);
+    cop2_put(8, 0x0800);  // IR0 = 0.5
+    gte_clear_flag();
+    cop2_cmd(COP2_INTPL(1, 0));
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    ramsyscall_printf("INTPL half: MAC=(%d,%d,%d)\n", mac1, mac2, mac3);
+    cester_assert_int_eq(2048, mac1);
+    cester_assert_int_eq(2048, mac2);
+    cester_assert_int_eq(2048, mac3);
+    // IR=0, FC=0x1000, IR0=0.5
+    // MAC = 0 + 0.5*(FC - 0) = 0.5 * 0x1000 = 0x800
+)
+
+// INTPL pushes color FIFO
+CESTER_TEST(intpl_color_push, gte_tests,
+    gte_set_far_color(0, 0, 0);
+    cop2_put(9, 0x0ff0);  // MAC1=0x0ff0, /16 = 255
+    cop2_put(10, 0x0800); // MAC2=0x0800, /16 = 128
+    cop2_put(11, 0x0010); // MAC3=0x0010, /16 = 1
+    cop2_put(8, 0);
+    cop2_put(6, 0xcc000000);  // CODE=0xCC
+    gte_clear_flag();
+    cop2_cmd(COP2_INTPL(1, 0));
+    uint32_t rgb2;
+    cop2_get(22, rgb2);
+    uint8_t cd = (rgb2 >> 24) & 0xff;
+    uint8_t r = rgb2 & 0xff;
+    uint8_t g = (rgb2 >> 8) & 0xff;
+    uint8_t b = (rgb2 >> 16) & 0xff;
+    ramsyscall_printf("INTPL color: R=%u G=%u B=%u CD=0x%02x raw=0x%08x\n", r, g, b, cd, rgb2);
+    cester_assert_uint_eq(255, r);
+    cester_assert_uint_eq(128, g);
+    cester_assert_uint_eq(1, b);
+    cester_assert_uint_eq(0xcc, cd);
+)
diff --git a/src/mips/tests/gte/gte-edgecase.c b/src/mips/tests/gte/gte-edgecase.c
new file mode 100644
index 000000000..9c06bdb79
--- /dev/null
+++ b/src/mips/tests/gte/gte-edgecase.c
@@ -0,0 +1,560 @@
+// Edge cases and degenerate inputs: division, overflow boundaries,
+// zero matrices, negative Z, FLAG verification per instruction.
+
+// ==========================================================================
+// Division edge cases (tested via RTPS)
+// ==========================================================================
+
+// Division by zero: SZ3=0
+CESTER_TEST(edge_div_by_zero, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 200);
+    cop2_put(0, (0 << 16) | 100);
+    cop2_put(1, 0);  // VZ0=0 -> SZ3=0
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sz3, sxy2, flag;
+    cop2_get(19, sz3);
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    ramsyscall_printf("div/0: SZ3=%u SXY2=0x%08x FLAG=0x%08x\n", sz3, sxy2, flag);
+    // SZ3=0, H=200 -> H >= SZ3*2 -> division overflow (FLAG.17)
+    uint32_t f17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(1, f17);
+)
+
+// H=0: zero numerator
+CESTER_TEST(edge_div_h_zero, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 0);  // H=0
+    cop2_put(0, (0 << 16) | 100);
+    cop2_put(1, 1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2, flag;
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    int16_t sx = (int16_t)(sxy2 & 0xffff);
+    ramsyscall_printf("H=0: SX=%d FLAG=0x%08x\n", sx, flag);
+    // H=0, SZ3=1000 -> H < SZ3*2 -> no overflow, quotient = 0
+    // SX = OFX/65536 + IR1 * 0 = 0
+    cester_assert_int_eq(0, sx);
+    uint32_t f17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(0, f17);
+)
+
+// Division overflow boundary: H=SZ3*2-1 (just under, no overflow)
+CESTER_TEST(edge_div_boundary_under, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 199);  // H=199
+    cop2_put(0, (0 << 16) | 100);
+    cop2_put(1, 100);  // SZ3=100 -> H < 200 -> no overflow
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t flag;
+    flag = gte_read_flag();
+    uint32_t f17 = (flag >> 17) & 1;
+    ramsyscall_printf("div boundary under: H=199 SZ3=100 FLAG.17=%u\n", f17);
+    cester_assert_uint_eq(0, f17);
+)
+
+// Division overflow boundary: H=SZ3*2 (exactly at overflow)
+CESTER_TEST(edge_div_boundary_at, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 200);  // H=200
+    cop2_put(0, (0 << 16) | 100);
+    cop2_put(1, 100);  // SZ3=100 -> H >= 200 -> overflow
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t flag;
+    flag = gte_read_flag();
+    uint32_t f17 = (flag >> 17) & 1;
+    ramsyscall_printf("div boundary at: H=200 SZ3=100 FLAG.17=%u\n", f17);
+    cester_assert_uint_eq(1, f17);
+)
+
+// Division overflow boundary: H=SZ3*2+1 (just over, definitely overflow)
+CESTER_TEST(edge_div_boundary_over, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 201);  // H=201
+    cop2_put(0, (0 << 16) | 100);
+    cop2_put(1, 100);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t flag;
+    flag = gte_read_flag();
+    uint32_t f17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(1, f17);
+)
+
+// ==========================================================================
+// IR saturation boundaries
+// ==========================================================================
+
+// IR at exactly 0x7FFF (max positive, no saturation)
+CESTER_TEST(edge_ir_max_no_sat, gte_tests,
+    cop2_put(8, 0x1000);
+    cop2_put(9, 0x7fff);
+    cop2_put(10, 0x7fff);
+    cop2_put(11, 0x7fff);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t ir1;
+    cop2_get(9, ir1);
+    uint32_t flag = gte_read_flag();
+    // 0x1000 * 0x7FFF >> 12 = 0x7FFF -> no saturation
+    cester_assert_uint_eq(0x7fff, ir1);
+    // FLAG.24 (IR1 sat) should NOT be set
+    uint32_t f24 = (flag >> 24) & 1;
+    cester_assert_uint_eq(0, f24);
+)
+
+// IR just over 0x7FFF (triggers saturation)
+CESTER_TEST(edge_ir_over_max, gte_tests,
+    cop2_put(8, 0x1001);  // IR0 = 0x1001 (slightly > 1.0)
+    cop2_put(9, 0x7fff);
+    cop2_put(10, 0x100);
+    cop2_put(11, 0x100);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t ir1;
+    cop2_get(9, ir1);
+    uint32_t flag = gte_read_flag();
+    ramsyscall_printf("IR over max: IR1=0x%04x FLAG=0x%08x\n", ir1 & 0xffff, flag);
+    // 0x1001 * 0x7FFF >> 12 = 0x8006 (> 0x7FFF) -> saturates to 0x7FFF
+    cester_assert_uint_eq(0x7fff, ir1);
+    uint32_t f24 = (flag >> 24) & 1;
+    cester_assert_uint_eq(1, f24);
+)
+
+// ==========================================================================
+// MAC0 overflow boundaries
+// ==========================================================================
+
+// NCLIP with values designed to overflow MAC0
+CESTER_TEST(edge_mac0_positive_overflow, gte_tests,
+    // Maximize cross product: opposing corners of 16-bit range
+    cop2_put(12, (0x7fff << 16) | 0x7fff);  // (32767, 32767)
+    cop2_put(13, (0x8000 << 16) | 0x8000);  // (-32768, -32768)
+    cop2_put(14, 0x00000000);                // (0, 0)
+    gte_clear_flag();
+    cop2_cmd(COP2_NCLIP);
+    int32_t mac0;
+    uint32_t flag;
+    cop2_get(24, mac0);
+    flag = gte_read_flag();
+    // NCLIP: MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1)
+    // = 32767*(-32768 - 0) + (-32768)*(0 - 32767) + 0*(32767 - (-32768))
+    // = 32767*(-32768) + (-32768)*(-32767)
+    // = -1073709056 + 1073709056
+    // = 0
+    // The three points are collinear (all on y = x), so the terms cancel
+    // exactly. An asymmetric triangle is needed to push MAC0 out of the
+    // signed 32-bit range.
+    ramsyscall_printf("MAC0 overflow test: MAC0=%d FLAG=0x%08x (F16=%u F15=%u)\n",
+                      mac0, flag, (flag >> 16) & 1, (flag >> 15) & 1);
+    // Cancels to zero - no actual overflow despite the test name
+    cester_assert_int_eq(0, mac0);
+    cester_assert_uint_eq(0x00000000, flag);
+)
+
+// NCLIP whose cross product exceeds the signed 32-bit range, so MAC0 wraps negative
+CESTER_TEST(edge_mac0_negative_overflow, gte_tests,
+    // (32767, 32767), (-32768, 32767), (32767, -32768)
+    cop2_put(12, (0x7fff << 16) | 0x7fff);
+    cop2_put(13, (0x7fff << 16) | 0x8000);
+    cop2_put(14, (0x8000 << 16) | 0x7fff);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCLIP);
+    int32_t mac0;
+    uint32_t flag;
+    cop2_get(24, mac0);
+    flag = gte_read_flag();
+    ramsyscall_printf("MAC0 neg overflow: MAC0=%d FLAG=0x%08x\n", mac0, flag);
+    // Cross product = (32767 + 32768)^2 = 65535^2 = 4294836225, too large for a
+    // signed 32-bit MAC0; the low 32 bits read back as -131071
+    cester_assert_int_eq(-131071, mac0);
+    // FLAG.16 set from intermediate positive overflow in NCLIP's chained additions
+    uint32_t f16 = (flag >> 16) & 1;
+    cester_assert_uint_eq(1, f16);
+)
+
+// ==========================================================================
+// Color saturation boundaries
+// ==========================================================================
+
+// Color output at exactly 255 (no saturation)
+CESTER_TEST(edge_color_at_255, gte_tests,
+    cop2_put(8, 0x1000);
+    cop2_put(9, 0x0ff0);   // MAC1 = 0x0ff0, /16 = 255
+    cop2_put(10, 0x0ff0);
+    cop2_put(11, 0x0ff0);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t rgb2, flag;
+    cop2_get(22, rgb2);
+    flag = gte_read_flag();
+    uint32_t r_255 = rgb2 & 0xff;
+    cester_assert_uint_eq(255, r_255);
+    uint32_t f21_255 = (flag >> 21) & 1;
+    cester_assert_uint_eq(0, f21_255);  // No color saturation flag
+)
+
+// Color output at 256 (saturates to 255, FLAG set)
+CESTER_TEST(edge_color_at_256, gte_tests,
+    cop2_put(8, 0x1000);
+    cop2_put(9, 0x1000);   // MAC1 = 0x1000, /16 = 256 -> saturates
+    cop2_put(10, 0x100);
+    cop2_put(11, 0x100);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t rgb2, flag;
+    cop2_get(22, rgb2);
+    flag = gte_read_flag();
+    uint32_t r_256 = rgb2 & 0xff;
+    cester_assert_uint_eq(255, r_256);  // saturated to 255
+    uint32_t f21_256 = (flag >> 21) & 1;
+    cester_assert_uint_eq(1, f21_256);  // R saturation flag set
+)
+
+// Negative color (saturates to 0, FLAG set)
+CESTER_TEST(edge_color_negative, gte_tests,
+    cop2_put(8, 0x1000);
+    cop2_put(9, 0xffff8000);  // IR1 = -32768 -> negative MAC1 -> color=0
+    cop2_put(10, 0x100);
+    cop2_put(11, 0x100);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t rgb2, flag;
+    cop2_get(22, rgb2);
+    flag = gte_read_flag();
+    uint32_t r_neg = rgb2 & 0xff;
+    cester_assert_uint_eq(0, r_neg);  // clamped to 0
+    uint32_t f21_neg = (flag >> 21) & 1;
+    cester_assert_uint_eq(1, f21_neg);  // Color R saturation flag
+)
+
+// ==========================================================================
+// Screen coordinate saturation
+// ==========================================================================
+
+// SX at exactly 0x3FF (max, no saturation)
+CESTER_TEST(edge_sx_at_max, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0x3ff << 16);  // OFX = 0x3FF in 16.16
+    cop2_putc(25, 0);
+    cop2_putc(26, 0);  // H=0 -> quotient=0 -> SX = OFX only
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, 0);
+    cop2_put(1, 1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2, flag;
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    int16_t sx = (int16_t)(sxy2 & 0xffff);
+    cester_assert_int_eq(0x3ff, sx);
+    uint32_t f14 = (flag >> 14) & 1;
+    cester_assert_uint_eq(0, f14);  // no saturation
+)
+
+// SX at 0x400 (saturates to 0x3FF)
+CESTER_TEST(edge_sx_over_max, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0x400 << 16);  // OFX = 0x400
+    cop2_putc(25, 0);
+    cop2_putc(26, 0);
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, 0);
+    cop2_put(1, 1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2, flag;
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    int16_t sx = (int16_t)(sxy2 & 0xffff);
+    cester_assert_int_eq(0x3ff, sx);  // saturated
+    uint32_t f14 = (flag >> 14) & 1;
+    cester_assert_uint_eq(1, f14);
+)
+
+// SY at -0x400 (min, no saturation)
+CESTER_TEST(edge_sy_at_min, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);
+    cop2_putc(25, (uint32_t)(-0x400) << 16);  // OFY = -0x400
+    cop2_putc(26, 0);
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, 0);
+    cop2_put(1, 1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2, flag;
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    int16_t sy = (int16_t)(sxy2 >> 16);
+    cester_assert_int_eq(-0x400, sy);
+    uint32_t f13 = (flag >> 13) & 1;
+    cester_assert_uint_eq(0, f13);
+)
+
+// ==========================================================================
+// Degenerate matrix states
+// ==========================================================================
+
+// Zero rotation matrix: everything should become translation only
+CESTER_TEST(edge_zero_matrix, gte_tests,
+    cop2_putc(0, 0);
+    cop2_putc(1, 0);
+    cop2_putc(2, 0);
+    cop2_putc(3, 0);
+    cop2_putc(4, 0);
+    gte_set_translation(100, 200, 300);
+    cop2_put(0, (0x7fff << 16) | 0x7fff);  // large vertex
+    cop2_put(1, 0x7fff);
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, COP2_MX_RT, COP2_V_V0, COP2_CV_TR, 0));
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // Zero matrix * anything = 0, plus translation
+    cester_assert_int_eq(100, mac1);
+    cester_assert_int_eq(200, mac2);
+    cester_assert_int_eq(300, mac3);
+)
+
+// Max magnitude matrix elements
+CESTER_TEST(edge_max_matrix, gte_tests,
+    cop2_putc(0, 0x7fff7fff);  // R11=R12=0x7FFF
+    cop2_putc(1, 0x7fff7fff);
+    cop2_putc(2, 0x7fff7fff);
+    cop2_putc(3, 0x7fff7fff);
+    cop2_putc(4, 0x7fff);
+    gte_set_translation(0, 0, 0);
+    cop2_put(0, (0x7fff << 16) | 0x7fff);
+    cop2_put(1, 0x7fff);
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, COP2_MX_RT, COP2_V_V0, COP2_CV_NONE, 0));
+    int32_t mac1;
+    uint32_t flag;
+    cop2_get(25, mac1);
+    flag = gte_read_flag();
+    ramsyscall_printf("max matrix: MAC1=%d FLAG=0x%08x\n", mac1, flag);
+    // 3 * 0x7FFF * 0x7FFF = 3 * 1073676289 = 3221028867
+    // >> 12 = 786384, fits in 32-bit MAC1 but exceeds 0x7FFF, so IR1 saturates (FLAG.24)
+    cester_assert_int_eq(786384, mac1);
+    cester_assert_uint_eq(0x81c00000, flag);
+)
+
+// Negative Z in RTPS (behind camera)
+CESTER_TEST(edge_negative_z, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, -1000);  // TRZ = -1000
+    gte_set_screen(160 << 16, 120 << 16, 200);
+    cop2_put(0, (0 << 16) | 100);
+    cop2_put(1, 0);  // VZ=0, MAC3 = TRZ = -1000
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sz3, flag;
+    int32_t mac3;
+    cop2_get(19, sz3);
+    cop2_get(27, mac3);
+    flag = gte_read_flag();
+    ramsyscall_printf("neg Z: MAC3=%d SZ3=%u FLAG=0x%08x\n", mac3, sz3, flag);
+    // MAC3 = -1000, SZ3 should saturate to 0 (Lm_D clamps to [0, 0xFFFF])
+    cester_assert_int_eq(-1000, mac3);
+    cester_assert_uint_eq(0, sz3);  // saturated
+    uint32_t f18 = (flag >> 18) & 1;
+    cester_assert_uint_eq(1, f18);  // OTZ/SZ3 saturation
+)
+
+// SQR of -0x8000 (minimum 16-bit signed)
+CESTER_TEST(edge_sqr_min_negative, gte_tests,
+    cop2_put(9, 0xffff8000);  // IR1 = -32768
+    cop2_put(10, 0);
+    cop2_put(11, 0);
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(0, 0));
+    int32_t mac1;
+    uint32_t flag;
+    cop2_get(25, mac1);
+    flag = gte_read_flag();
+    // (-32768)^2 = 1073741824 = 0x40000000 (fits in 32-bit signed)
+    ramsyscall_printf("SQR(-32768): MAC1=%d FLAG=0x%08x\n", mac1, flag);
+    cester_assert_int_eq(1073741824, mac1);
+)
+
+// GPL with negative MAC base
+CESTER_TEST(edge_gpl_negative_base, gte_tests,
+    cop2_put(25, -10000);  // MAC1 = -10000
+    cop2_put(26, -20000);
+    cop2_put(27, -30000);
+    cop2_put(8, 0x1000);  // IR0 = 1.0
+    cop2_put(9, 100);
+    cop2_put(10, 200);
+    cop2_put(11, 300);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPL(1, 0));
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // GPL sf=1: MAC = (old_MAC << 12 + IR0*IR) >> 12
+    // = ((-10000 << 12) + 4096*100) >> 12
+    // = (-40960000 + 409600) >> 12
+    // = -40550400 >> 12 = -9900
+    cester_assert_int_eq(-9900, mac1);
+    cester_assert_int_eq(-19800, mac2);
+    cester_assert_int_eq(-29700, mac3);
+)
+
+// ==========================================================================
+// FLAG cleared at instruction start
+// ==========================================================================
+
+// Verify FLAG is reset to 0 at the start of each GTE instruction,
+// not accumulating from previous instructions
+CESTER_TEST(edge_flag_cleared_each_instruction, gte_tests,
+    // First: trigger IR1 saturation via GPF
+    cop2_put(8, 0x1001);
+    cop2_put(9, 0x7fff);
+    cop2_put(10, 0x100);
+    cop2_put(11, 0x100);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t flag1 = gte_read_flag();
+    uint32_t f24_1 = (flag1 >> 24) & 1;
+    cester_assert_uint_eq(1, f24_1);  // IR1 saturated
+
+    // Now: run a clean GPF that should NOT trigger any flags
+    cop2_put(8, 0x1000);
+    cop2_put(9, 0x100);
+    cop2_put(10, 0x100);
+    cop2_put(11, 0x100);
+    cop2_put(6, 0x00808080);
+    // Do NOT call gte_clear_flag() - the instruction should clear it itself
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t flag2 = gte_read_flag();
+    // FLAG should be 0 - the instruction clears it at start
+    cester_assert_uint_eq(0, flag2);
+)
+
+// ==========================================================================
+// IR0 saturation boundary
+// ==========================================================================
+
+// IR0 at exactly 0x1000 (max, no saturation)
+CESTER_TEST(edge_ir0_at_max, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);
+    cop2_putc(25, 0);
+    cop2_putc(26, 200);
+    cop2_putc(27, 0);          // DQA = 0
+    cop2_putc(28, 0x1000000);  // DQB = 0x1000000 -> MAC0=DQB, IR0=DQB>>12=0x1000
+    cop2_put(0, 0);
+    cop2_put(1, 1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t ir0, flag;
+    cop2_get(8, ir0);
+    flag = gte_read_flag();
+    ramsyscall_printf("IR0 max: IR0=0x%04x FLAG=0x%08x\n", ir0 & 0xffff, flag);
+    // IR0 should be exactly 0x1000
+    uint32_t f12 = (flag >> 12) & 1;
+    cester_assert_uint_eq(0, f12);  // no saturation
+)
+
+// ==========================================================================
+// OTZ saturation boundary
+// ==========================================================================
+
+// OTZ at exactly 0xFFFF (largest value inside the [0, 0xFFFF] clamp range)
+CESTER_TEST(edge_otz_at_max, gte_tests,
+    // Need MAC0 >> 12 = 0xFFFF -> MAC0 = 0xFFFF << 12 = 0xFFFF000
+    // ZSF3 * (SZ1+SZ2+SZ3) = 0xFFFF000
+    // Use ZSF3 = 0x1000, SZ_sum = 0xFFFF -> each SZ = 0x5555
+    cop2_put(17, 0x5555);
+    cop2_put(18, 0x5555);
+    cop2_put(19, 0x5555);
+    cop2_putc(29, 0x1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_AVSZ3);
+    uint32_t otz, flag;
+    cop2_get(7, otz);
+    flag = gte_read_flag();
+    ramsyscall_printf("OTZ max: OTZ=%u FLAG=0x%08x\n", otz, flag);
+    // 0x5555*3 = 0xFFFF, * 0x1000 = 0xFFFF000, >> 12 = 0xFFFF
+    cester_assert_uint_eq(0xffff, otz);
+)
+
+// ==========================================================================
+// Depth cue inner clamp (FC - input can go negative)
+// ==========================================================================
+
+// DPCS where FC << input color (FC-input negative, inner lm=0 clamp)
+CESTER_TEST(edge_depthcue_fc_less_than_input, gte_tests,
+    gte_set_far_color(0, 0, 0);  // FC = 0 (dark fog)
+    cop2_put(6, 0x00ffffff);     // RGBC: R=G=B=0xFF (bright)
+    cop2_put(8, 0x0800);         // IR0 = 0.5
+    gte_clear_flag();
+    cop2_cmd(COP2_DPCS(1, 0));
+    int32_t mac1;
+    uint32_t rgb2, flag;
+    cop2_get(25, mac1);
+    cop2_get(22, rgb2);
+    flag = gte_read_flag();
+    ramsyscall_printf("DPCS FC<input: MAC1=%d RGB2=0x%08x FLAG=0x%08x\n", mac1, rgb2, flag);
+    // FC=0, R=0xFF: diff = (0<<12) - (0xFF<<16) = -0xFF0000, >> 12 = -0xFF0
+    // Inner clamp (lm=0) to [-0x8000, 0x7FFF] leaves -0xFF0 unchanged
+    // MAC1 = (0xFF0000 + 0x800 * -0xFF0) >> 12 = 0x7F8000 >> 12 = 2040
+    cester_assert_int_eq(2040, mac1);
+    cester_assert_uint_eq(0x007f7f7f, rgb2);
+    cester_assert_uint_eq(0x00000000, flag);
+)
+
+// ==========================================================================
+// INTPL where FC < IR (interpolation goes backward)
+// ==========================================================================
+
+CESTER_TEST(edge_intpl_fc_less_than_ir, gte_tests,
+    gte_set_far_color(0, 0, 0);  // FC = 0
+    cop2_put(9, 0x1000);  // IR = 0x1000 (> FC)
+    cop2_put(10, 0x1000);
+    cop2_put(11, 0x1000);
+    cop2_put(8, 0x0800);  // IR0 = 0.5
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_INTPL(1, 0));
+    int32_t mac1;
+    uint32_t flag;
+    cop2_get(25, mac1);
+    flag = gte_read_flag();
+    ramsyscall_printf("INTPL FC<IR: MAC1=%d FLAG=0x%08x\n", mac1, flag);
+    // IR=0x1000, FC=0, IR0=0.5
+    // diff = (0<<12) - (0x1000<<12) = -0x1000000
+    // inner clamp: -0x1000000 >> 12 = -0x1000 -> clamped to -0x1000 (in range)
+    // MAC = 0x1000<<12 + 0x800 * (-0x1000) = 0x1000000 + (-0x800000)
+    // >> 12 = (0x800000) >> 12 = 0x800 = 2048
+    cester_assert_int_eq(2048, mac1);
+    cester_assert_uint_eq(0x00000000, flag);
+)
diff --git a/src/mips/tests/gte/gte-encoding.c b/src/mips/tests/gte/gte-encoding.c
new file mode 100644
index 000000000..64d9962c3
--- /dev/null
+++ b/src/mips/tests/gte/gte-encoding.c
@@ -0,0 +1,369 @@
+// GTE instruction encoding tests: systematic sweep of bitfield parameters.
+//
+// Helper macros for unrolled MVMVA sweeps. Defined at file scope so they
+// survive cester's double-include of __BASE_FILE__.
+
+#define MVMVA_T(mx, v, cv) do { /* one sweep step; results are not read back */ \
+    if (3 == (v)) { cop2_put(11, 0x300); cop2_put(10, 0x200); cop2_put(9, 0x100); } \
+    gte_clear_flag(); /* registers 9..11 were reloaded above when v == 3 */ \
+    cop2_cmd(COP2_MVMVA(1, mx, v, cv, 0)); \
+} while (0)
+
+#define MVMVA_MX3_V(v) do { /* mx=3, cv=3: run once and log MAC1..MAC3 */ \
+    if (3 == (v)) { cop2_put(11, 0x600); cop2_put(10, 0x500); cop2_put(9, 0x400); } \
+    gte_clear_flag(); \
+    cop2_cmd(COP2_MVMVA(1, 3, v, 3, 0)); \
+    int32_t _x, _y, _z; \
+    cop2_get(27, _z); cop2_get(26, _y); cop2_get(25, _x); \
+    ramsyscall_printf("MVMVA mx=3 v=%d: MAC=(%d,%d,%d)\n", v, _x, _y, _z); \
+} while (0)
+
+#define MVMVA_CV2_MX(mx) do { /* cv=2, v=0: run once, log MAC1..MAC3 and FLAG */ \
+    cop2_put(11, 0x300); cop2_put(10, 0x200); cop2_put(9, 0x100); \
+    gte_clear_flag(); \
+    cop2_cmd(COP2_MVMVA(1, mx, 0, 2, 0)); \
+    int32_t _x, _y, _z; uint32_t _status; \
+    cop2_get(25, _x); cop2_get(26, _y); cop2_get(27, _z); _status = gte_read_flag(); \
+    ramsyscall_printf("MVMVA mx=%d cv=2: MAC=(%d,%d,%d) FLAG=0x%08x\n", mx, _x, _y, _z, _status); \
+} while (0)
+// Encoding reference for the tests in this file.
+// The GTE command word is a 25-bit immediate with fields:
+//   [fake:5][sf:1][mx:2][v:2][cv:2][pad:2][lm:1][pad:4][fn:6]
+//
+// These tests verify:
+// 1. The "fake" field (bits 24-20) is ignored by hardware
+// 2. sf=0 vs sf=1 behavior for each function code
+// 3. lm=0 vs lm=1 behavior for each function code
+// 4. All MVMVA mx/v/cv combinations produce results
+// 5. Unused bitfield values don't crash
+
+// ==========================================================================
+// Fake field is ignored by hardware
+// ==========================================================================
+
+// Run RTPS with fake=0 (non-standard) and verify same result as fake=1
+CESTER_TEST(enc_fake_field_ignored_rtps, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 1000);
+    gte_set_screen(160 << 16, 120 << 16, 200);
+    cop2_put(0, 0);
+    cop2_put(1, 0);
+
+    // Standard encoding: fake=1, sf=1
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2_std;
+    cop2_get(14, sxy2_std);
+
+    // Non-standard: fake=0, same sf/fn
+    cop2_put(14, 0x7fff7fff);  // poison SXY2: a stale result must not pass
+    cop2_put(0, 0);
+    cop2_put(1, 0);
+    gte_clear_flag();
+    cop2_cmd(COP2_OP(0, 1, 0, 0, 0, 0, COP2_FN_RTPS));
+    uint32_t sxy2_alt;
+    cop2_get(14, sxy2_alt);
+    cester_assert_uint_eq(sxy2_std, sxy2_alt);
+)
+
+// Run GPF with fake=31 (max) vs standard fake=25
+CESTER_TEST(enc_fake_field_ignored_gpf, gte_tests,
+    cop2_put(8, 0x1000);
+    cop2_put(9, 100);
+    cop2_put(10, 200);
+    cop2_put(11, 300);
+    cop2_put(6, 0x00808080);
+
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));  // reference run, standard encoding
+    int32_t mac1_std;
+    cop2_get(25, mac1_std);
+
+    cop2_put(25, -1);  // poison MAC1: a stale result must not pass
+    cop2_put(8, 0x1000);
+    cop2_put(9, 100);
+    cop2_put(10, 200);
+    cop2_put(11, 300);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_OP(31, 1, 0, 0, 0, 0, COP2_FN_GPF));  // fake=31, same sf/fn
+    int32_t mac1_alt;
+    cop2_get(25, mac1_alt);
+    cester_assert_int_eq(mac1_std, mac1_alt);
+)
+
+// ==========================================================================
+// sf=0 vs sf=1 for each instruction
+// ==========================================================================
+
+// GPF: sf changes shift behavior
+CESTER_TEST(enc_gpf_sf_difference, gte_tests,
+    // Pass 1, sf=1: MAC1 = (IR0 * IR1) >> 12 = (0x1000 * 0x1000) >> 12 = 0x1000
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 0x1000);
+    cop2_put(10, 0x1000);
+    cop2_put(9, 0x1000);
+    cop2_put(8, 0x1000);
+
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    int32_t shifted;
+    cop2_get(25, shifted);
+
+    // Pass 2, sf=0: same inputs reloaded, MAC1 = IR0 * IR1 = 0x1000000
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 0x1000);
+    cop2_put(10, 0x1000);
+    cop2_put(9, 0x1000);
+    cop2_put(8, 0x1000);
+
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(0, 0));
+    int32_t unshifted;
+    cop2_get(25, unshifted);
+
+    cester_assert_int_eq(0x1000, shifted);
+    cester_assert_int_eq(0x1000000, unshifted);
+)
+
+// SQR: sf changes shift
+CESTER_TEST(enc_sqr_sf_difference, gte_tests,
+    // sf=1 pass: expect (0x100 * 0x100) >> 12 = 0x10000 >> 12 = 0x10
+    // Only MAC1 is compared; IR1..IR3 all hold the same input.
+    cop2_put(11, 0x100);
+    cop2_put(10, 0x100);
+    cop2_put(9, 0x100);
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(1, 0));
+    int32_t shifted;
+    cop2_get(25, shifted);
+
+    // sf=0 pass: expect 0x100 * 0x100 = 0x10000 (IR reloaded first)
+    cop2_put(11, 0x100);
+    cop2_put(10, 0x100);
+    cop2_put(9, 0x100);
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(0, 0));
+    int32_t unshifted;
+    cop2_get(25, unshifted);
+
+    cester_assert_int_eq(0x10, shifted);
+    cester_assert_int_eq(0x10000, unshifted);
+)
+
+// OP: sf changes shift
+CESTER_TEST(enc_op_sf_difference, gte_tests,
+    cop2_putc(4, 0x1000);
+    cop2_putc(2, 0x00002000);
+    cop2_putc(0, 0x00001000);
+
+    // sf=1: MAC2 = (R33*IR1 - R11*IR3) >> 12 = (0x1000*100 - 0x1000*0) >> 12 = 100
+    cop2_put(11, 0);
+    cop2_put(10, 0);
+    cop2_put(9, 100);
+    gte_clear_flag();
+    cop2_cmd(COP2_OP_CP(1, 0));
+    int32_t shifted;
+    cop2_get(26, shifted);
+
+    // sf=0: MAC2 = R33*IR1 - R11*IR3 = 0x1000*100 = 409600 (IR reloaded first)
+    cop2_put(11, 0);
+    cop2_put(10, 0);
+    cop2_put(9, 100);
+    gte_clear_flag();
+    cop2_cmd(COP2_OP_CP(0, 0));
+    int32_t unshifted;
+    cop2_get(26, unshifted);
+
+    cester_assert_int_eq(100, shifted);
+    cester_assert_int_eq(409600, unshifted);
+)
+
+// ==========================================================================
+// lm=0 vs lm=1 for each instruction
+// ==========================================================================
+
+// SQR: lm=1 clamps IR to [0, 0x7fff]
+CESTER_TEST(enc_sqr_lm_difference, gte_tests,
+    // lm=0 pass (sf=1): 2.0 squared = 4.0 = 0x4000, in range for signed
+    cop2_put(11, 0x2000);
+    cop2_put(10, 0x2000);
+    cop2_put(9, 0x2000);  // 2.0
+
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(1, 0));
+    uint32_t ir1_no_lm;
+    cop2_get(9, ir1_no_lm);
+
+    // lm=1 pass (sf=1): lm=1 only clamps negatives to 0, so 0x4000 is kept
+    cop2_put(11, 0x2000);
+    cop2_put(10, 0x2000);
+    cop2_put(9, 0x2000);
+
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(1, 1));
+    uint32_t ir1_with_lm;
+    cop2_get(9, ir1_with_lm);
+
+    // The result is positive, so both limit modes must agree on 0x4000
+    cester_assert_uint_eq(0x4000, ir1_no_lm);
+    cester_assert_uint_eq(0x4000, ir1_with_lm);
+)
+
+// ==========================================================================
+// MVMVA: all mx/v/cv combinations (4 x 4 x 4 = 64 combos)
+// ==========================================================================
+
+// Sweep all 64 MVMVA parameter combinations and verify no crash.
+// MVMVA_T does not read results back; only completion is asserted.
+CESTER_TEST(enc_mvmva_full_sweep, gte_tests,
+    // Load every matrix and vector slot with known non-zero values
+    // LC matrix
+    cop2_putc(16, 0x02000400);
+    cop2_putc(17, 0x00800100);
+    cop2_putc(18, 0x02000400);
+    cop2_putc(19, 0x00800100);
+    cop2_putc(20, 0x0400);
+    // LL matrix
+    cop2_putc(8, 0x04000800);
+    cop2_putc(9, 0x01000200);
+    cop2_putc(10, 0x04000800);
+    cop2_putc(11, 0x01000200);
+    cop2_putc(12, 0x0800);
+    // RT matrix
+    cop2_putc(0, 0x08001000);
+    cop2_putc(1, 0x02000400);
+    cop2_putc(2, 0x08001000);
+    cop2_putc(3, 0x02000400);
+    cop2_putc(4, 0x1000);
+    // Input vectors (IR first, then V2, V1, V0)
+    cop2_put(8, 0x0800);  // IR0
+    cop2_put(9, 0x100);   // IR1
+    cop2_put(10, 0x200);  // IR2
+    cop2_put(11, 0x300);  // IR3
+    cop2_put(4, (0x800 << 16) | 0x700);  // V2
+    cop2_put(5, 0x900);
+    cop2_put(2, (0x500 << 16) | 0x400);  // V1
+    cop2_put(3, 0x600);
+    cop2_put(0, (0x200 << 16) | 0x100);  // V0
+    cop2_put(1, 0x300);
+    // Control vectors
+    gte_set_translation(100, 200, 300);
+    cop2_putc(13, 400);
+    cop2_putc(14, 500);
+    cop2_putc(15, 600);
+    gte_set_far_color(700, 800, 900);
+
+    // cop2_cmd requires compile-time constants, so all 64 combos are unrolled.
+    MVMVA_T(0,0,0); MVMVA_T(0,0,1); MVMVA_T(0,0,2); MVMVA_T(0,0,3);
+    MVMVA_T(0,1,0); MVMVA_T(0,1,1); MVMVA_T(0,1,2); MVMVA_T(0,1,3);
+    MVMVA_T(0,2,0); MVMVA_T(0,2,1); MVMVA_T(0,2,2); MVMVA_T(0,2,3);
+    MVMVA_T(0,3,0); MVMVA_T(0,3,1); MVMVA_T(0,3,2); MVMVA_T(0,3,3);
+    MVMVA_T(1,0,0); MVMVA_T(1,0,1); MVMVA_T(1,0,2); MVMVA_T(1,0,3);
+    MVMVA_T(1,1,0); MVMVA_T(1,1,1); MVMVA_T(1,1,2); MVMVA_T(1,1,3);
+    MVMVA_T(1,2,0); MVMVA_T(1,2,1); MVMVA_T(1,2,2); MVMVA_T(1,2,3);
+    MVMVA_T(1,3,0); MVMVA_T(1,3,1); MVMVA_T(1,3,2); MVMVA_T(1,3,3);
+    MVMVA_T(2,0,0); MVMVA_T(2,0,1); MVMVA_T(2,0,2); MVMVA_T(2,0,3);
+    MVMVA_T(2,1,0); MVMVA_T(2,1,1); MVMVA_T(2,1,2); MVMVA_T(2,1,3);
+    MVMVA_T(2,2,0); MVMVA_T(2,2,1); MVMVA_T(2,2,2); MVMVA_T(2,2,3);
+    MVMVA_T(2,3,0); MVMVA_T(2,3,1); MVMVA_T(2,3,2); MVMVA_T(2,3,3);
+    MVMVA_T(3,0,0); MVMVA_T(3,0,1); MVMVA_T(3,0,2); MVMVA_T(3,0,3);
+    MVMVA_T(3,1,0); MVMVA_T(3,1,1); MVMVA_T(3,1,2); MVMVA_T(3,1,3);
+    MVMVA_T(3,2,0); MVMVA_T(3,2,1); MVMVA_T(3,2,2); MVMVA_T(3,2,3);
+    MVMVA_T(3,3,0); MVMVA_T(3,3,1); MVMVA_T(3,3,2); MVMVA_T(3,3,3);
+    cester_assert_int_eq(1, 1); // reaching this line means no combo crashed
+)
+
+// ==========================================================================
+// MVMVA mx=3 (garbage matrix) with every vector selection (cv fixed at 3)
+// ==========================================================================
+
+CESTER_TEST(enc_mvmva_mx3_all_vectors, gte_tests,
+    cop2_putc(4, 0x1000);
+    cop2_putc(3, 0x80007000);
+    cop2_putc(2, 0x60005000);
+    cop2_putc(1, 0x40003000);
+    cop2_putc(0, 0x20001000);
+    cop2_put(11, 0x600);  // registers 9..11 are reloaded by MVMVA_MX3_V when v == 3
+    cop2_put(10, 0x500);
+    cop2_put(9, 0x400);
+    cop2_put(8, 0x0800);
+    cop2_put(5, 0x300);
+    cop2_put(4, (0x300 << 16) | 0x300);
+    cop2_put(3, 0x200);
+    cop2_put(2, (0x200 << 16) | 0x200);
+    cop2_put(1, 0x100);
+    cop2_put(0, (0x100 << 16) | 0x100);
+
+    MVMVA_MX3_V(0); MVMVA_MX3_V(1); MVMVA_MX3_V(2); MVMVA_MX3_V(3);
+    cester_assert_int_eq(1, 1);  // completion only; MAC values are logged, not checked
+)
+
+// ==========================================================================
+// MVMVA cv=2 (FC bug) with matrices mx=0..2 (v fixed at 0)
+// ==========================================================================
+
+CESTER_TEST(enc_mvmva_cv2_all_matrices, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    gte_set_far_color(0x1000, 0x2000, 0x3000);
+    cop2_put(11, 0x300);  // registers 9..11 are rewritten by every MVMVA_CV2_MX
+    cop2_put(10, 0x200);
+    cop2_put(9, 0x100);
+    cop2_put(1, 0x300);
+    cop2_put(0, (0x200 << 16) | 0x100);
+
+    MVMVA_CV2_MX(0); MVMVA_CV2_MX(1); MVMVA_CV2_MX(2);
+    cester_assert_int_eq(1, 1);  // completion only; results are logged, not checked
+)
+
+// ==========================================================================
+// Instructions that ignore sf/lm should produce identical results
+// ==========================================================================
+
+// NCLIP ignores sf and lm
+CESTER_TEST(enc_nclip_ignores_sf_lm, gte_tests,
+    cop2_put(12, 0x00000000);
+    cop2_put(13, 0x00000064);
+    cop2_put(14, 0x00640000);
+
+    gte_clear_flag();
+    cop2_cmd(COP2_OP(20, 0, 0, 0, 0, 0, COP2_FN_NCLIP));  // standard
+    int32_t mac0_std;
+    cop2_get(24, mac0_std);
+
+    cop2_put(24, -1);  // poison MAC0: a stale result must not pass
+    cop2_put(12, 0x00000000);
+    cop2_put(13, 0x00000064);
+    cop2_put(14, 0x00640000);
+    gte_clear_flag();
+    cop2_cmd(COP2_OP(0, 1, 3, 3, 3, 1, COP2_FN_NCLIP));   // fake=0, sf/mx/v/cv/lm all set
+    int32_t mac0_alt;
+    cop2_get(24, mac0_alt);
+    cester_assert_int_eq(mac0_std, mac0_alt);
+)
+
+// AVSZ3 ignores sf and lm (uses fixed >>12)
+CESTER_TEST(enc_avsz3_ignores_sf_lm, gte_tests,
+    cop2_put(17, 100);
+    cop2_put(18, 200);
+    cop2_put(19, 300);
+    cop2_putc(29, 0x555);
+
+    gte_clear_flag();
+    cop2_cmd(COP2_AVSZ3);  // reference run, standard encoding
+    int32_t mac0_std;
+    cop2_get(24, mac0_std);
+
+    cop2_put(24, -1);  // poison MAC0: a stale result must not pass
+    cop2_put(17, 100);
+    cop2_put(18, 200);
+    cop2_put(19, 300);
+    cop2_putc(29, 0x555);
+    gte_clear_flag();
+    cop2_cmd(COP2_OP(0, 0, 3, 3, 3, 1, COP2_FN_AVSZ3));  // fake=0, sf=0, mx/v/cv/lm set
+    int32_t mac0_alt;
+    cop2_get(24, mac0_alt);
+    cester_assert_int_eq(mac0_std, mac0_alt);
+)
diff --git a/src/mips/tests/gte/gte-gpf-gpl.c b/src/mips/tests/gte/gte-gpf-gpl.c
new file mode 100644
index 000000000..f262a9074
--- /dev/null
+++ b/src/mips/tests/gte/gte-gpf-gpl.c
@@ -0,0 +1,148 @@
+// GPF: general purpose interpolation (IR0 * IR -> MAC/IR, push color)
+// GPL: general purpose interpolation with base (MAC + IR0 * IR -> MAC/IR, push color)
+
+CESTER_TEST(gpf_shifted_unity, gte_tests,
+    cop2_put(6, 0x00204060);  // RGBC
+    cop2_put(11, 300);
+    cop2_put(10, 200);
+    cop2_put(9, 100);
+    cop2_put(8, 0x1000);  // IR0 = 1.0, so each MAC is expected to equal its IR input
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    int32_t out1, out2, out3;
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    cester_assert_int_eq(100, out1);
+    cester_assert_int_eq(200, out2);
+    cester_assert_int_eq(300, out3);
+)
+
+CESTER_TEST(gpf_shifted_half, gte_tests,
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 4000);
+    cop2_put(10, 2000);
+    cop2_put(9, 1000);
+    cop2_put(8, 0x0800);  // IR0 = 0.5
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    int32_t out1, out2, out3;
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    // (0x800 * IR) >> 12 halves each input
+    cester_assert_int_eq(500, out1);
+    cester_assert_int_eq(1000, out2);
+    cester_assert_int_eq(2000, out3);
+)
+
+// GPF pushes color FIFO
+CESTER_TEST(gpf_color_fifo_push, gte_tests,
+    cop2_put(6, 0xaa000000);  // RGBC: CODE=0xaa
+    cop2_put(11, 0x0200); // IR3 -> MAC3=0x200, /16=32
+    cop2_put(10, 0x0400); // IR2 -> MAC2=0x400, /16=64
+    cop2_put(9, 0x0800);  // IR1 -> MAC1=0x800, /16=128
+    cop2_put(8, 0x1000);  // IR0 = 1.0
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    uint32_t pushed;
+    cop2_get(22, pushed);
+    uint8_t code = (uint8_t)(pushed >> 24);
+    uint8_t blue = (uint8_t)(pushed >> 16);
+    uint8_t green = (uint8_t)(pushed >> 8);
+    uint8_t red = (uint8_t)pushed;
+    ramsyscall_printf("GPF color: R=%u G=%u B=%u CD=0x%02x\n", red, green, blue, code);
+    // CODE byte is preserved from RGBC; each channel is MAC/16
+    cester_assert_uint_eq(0xaa, code);
+    cester_assert_uint_eq(128, red);
+    cester_assert_uint_eq(64, green);
+    cester_assert_uint_eq(32, blue);
+)
+
+// GPF unshifted (sf=0)
+CESTER_TEST(gpf_unshifted, gte_tests,
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 300);
+    cop2_put(10, 200);
+    cop2_put(9, 100);
+    cop2_put(8, 2);  // IR0 = 2
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(0, 0));
+    int32_t out1, out2, out3;
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    // With sf=0 the product is not shifted: MAC = IR0 * IR
+    cester_assert_int_eq(200, out1);
+    cester_assert_int_eq(400, out2);
+    cester_assert_int_eq(600, out3);
+)
+
+// GPL shifted with base
+CESTER_TEST(gpl_shifted, gte_tests,
+    cop2_put(27, 3000);  // MAC3 base
+    cop2_put(26, 2000);  // MAC2 base
+    cop2_put(25, 1000);  // MAC1 base
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 300);
+    cop2_put(10, 200);
+    cop2_put(9, 100);
+    cop2_put(8, 0x1000); // IR0 = 1.0
+    gte_clear_flag();
+    cop2_cmd(COP2_GPL(1, 0));
+    int32_t out1, out2, out3;
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    // GPL sf=1: MAC_new = ((MAC_old << 12) + IR0 * IR) >> 12
+    // MAC1 = ((1000 << 12) + 4096 * 100) >> 12 = (4096000 + 409600) >> 12 = 1100
+    cester_assert_int_eq(1100, out1);
+    cester_assert_int_eq(2200, out2);
+    cester_assert_int_eq(3300, out3);
+)
+
+// GPL unshifted (sf=0): MAC base used as-is, no shift
+CESTER_TEST(gpl_unshifted, gte_tests,
+    cop2_put(27, 300);
+    cop2_put(26, 200);
+    cop2_put(25, 100);
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 30);
+    cop2_put(10, 20);
+    cop2_put(9, 10);
+    cop2_put(8, 3);  // IR0 = 3
+    gte_clear_flag();
+    cop2_cmd(COP2_GPL(0, 0));
+    int32_t out1, out2, out3;
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    // sf=0: MAC_new = MAC_old + IR0 * IR -> 100 + 30, 200 + 60, 300 + 90
+    cester_assert_int_eq(130, out1);
+    cester_assert_int_eq(260, out2);
+    cester_assert_int_eq(390, out3);
+)
+
+// GPL pushes color FIFO
+CESTER_TEST(gpl_color_fifo, gte_tests,
+    cop2_put(27, 0);
+    cop2_put(26, 0);
+    cop2_put(25, 0);
+    cop2_put(6, 0x55000000);  // CODE=0x55
+    cop2_put(11, 0x0010); // /16 = 1
+    cop2_put(10, 0x0800); // /16 = 128
+    cop2_put(9, 0x0ff0);  // /16 = 255
+    cop2_put(8, 0x1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPL(1, 0));
+    uint32_t pushed;
+    cop2_get(22, pushed);
+    uint8_t code = (uint8_t)(pushed >> 24);
+    uint8_t blue = (uint8_t)(pushed >> 16);
+    uint8_t green = (uint8_t)(pushed >> 8);
+    uint8_t red = (uint8_t)pushed;
+    cester_assert_uint_eq(0x55, code);
+    cester_assert_uint_eq(255, red);
+    cester_assert_uint_eq(128, green);
+    cester_assert_uint_eq(1, blue);
+)
diff --git a/src/mips/tests/gte/gte-lighting.c b/src/mips/tests/gte/gte-lighting.c
new file mode 100644
index 000000000..a23448f77
--- /dev/null
+++ b/src/mips/tests/gte/gte-lighting.c
@@ -0,0 +1,326 @@
+// Lighting instructions: NCS, NCT, NCCS, NCCT, NCDS, NCDT, CC, CDP
+
+// NCS: normal color single (2-stage: normal->light, light->color)
+CESTER_TEST(ncs_z_normal_white_light, gte_tests,
+    gte_set_simple_light();       // L33=0x1000
+    gte_set_white_light_color();  // LC identity
+    gte_set_zero_bk();
+    // V0 is the normal (0, 0, 0x1000), pointing at the light
+    cop2_put(6, 0x00808080);  // RGBC (not used by NCS but CODE is)
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCS(1, 1));
+    uint32_t pushed;
+    int32_t out1, out2, out3;
+    cop2_get(22, pushed);
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    ramsyscall_printf("NCS z-normal: MAC=(%d,%d,%d) RGB2=0x%08x\n", out1, out2, out3, pushed);
+    // Stage 1: L * normal leaves only IR3 = 0x1000
+    // Stage 2: LC is identity and BK is 0, so MAC = (0, 0, 0x1000)
+    // Color FIFO: 0x1000 / 16 = 256, which saturates to 255
+    cester_assert_int_eq(0, out1);
+    cester_assert_int_eq(0, out2);
+    cester_assert_int_eq(4096, out3);
+    cester_assert_uint_eq(0x00ff0000, pushed);
+)
+
+// NCS with background color
+CESTER_TEST(ncs_with_background, gte_tests,
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    cop2_putc(15, 0x200);  // BBK = 0x200
+    cop2_putc(14, 0x400);  // GBK = 0x400
+    cop2_putc(13, 0x800);  // RBK = 0x800
+    cop2_put(6, 0x00000000);
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCS(1, 1));
+    int32_t out1, out2, out3;
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    // Stage 1 leaves IR = (0, 0, 0x1000)
+    // Stage 2 adds BK: MAC = (0x800 + 0, 0x400 + 0, 0x200 + 0x1000)
+    cester_assert_int_eq(0x800, out1);
+    cester_assert_int_eq(0x400, out2);
+    cester_assert_int_eq(0x1200, out3);
+)
+
+// NCT: normal color triple
+CESTER_TEST(nct_three_normals, gte_tests,
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    // V2 = (0, 0x1000, 0) - perpendicular
+    cop2_put(5, 0);
+    cop2_put(4, (0x1000 << 16) | 0);
+    // V1 = (0x1000, 0, 0) - perpendicular
+    cop2_put(3, 0);
+    cop2_put(2, (0 << 16) | 0x1000);
+    // V0 = (0, 0, 0x1000) - facing light
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    cop2_put(6, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCT(1, 1));
+    uint32_t first, second, third;
+    cop2_get(22, third);
+    cop2_get(21, second);
+    cop2_get(20, first);
+    ramsyscall_printf("NCT: RGB0=0x%08x RGB1=0x%08x RGB2=0x%08x\n", first, second, third);
+    // Only V0 faces the light, so only RGB0 is expected to carry color;
+    // V1 and V2 are perpendicular to it (light only in Z) and stay dark.
+    cester_assert_uint_eq(0x00ff0000, first);
+    cester_assert_uint_eq(0x00000000, second);
+    cester_assert_uint_eq(0x00000000, third);
+)
+
+// NCCS: normal color color single (adds vertex color multiplication)
+CESTER_TEST(nccs_basic, gte_tests,
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    cop2_put(6, 0x00808080);  // R=0x80, G=0x80, B=0x80
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCCS(1, 1));
+    uint32_t pushed;
+    int32_t out1, out2, out3;
+    cop2_get(22, pushed);
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    ramsyscall_printf("NCCS: MAC=(%d,%d,%d) RGB2=0x%08x\n", out1, out2, out3, pushed);
+    // Stage 1 (L * normal):    IR = (0, 0, 0x1000)
+    // Stage 2 (LC * IR, BK=0): MAC = IR = (0, 0, 0x1000)
+    // Stage 3 (vertex color):  MACn = ((C << 4) * IRn) >> 12 with C = 0x80
+    //   MAC1 = MAC2 = (0x800 * 0) >> 12 = 0
+    //   MAC3 = (0x800 * 0x1000) >> 12 = 0x800 = 2048, color byte 2048/16 = 0x80
+    // Only the B channel is lit because only IR3 is non-zero.
+    cester_assert_int_eq(0, out1);
+    cester_assert_int_eq(0, out2);
+    cester_assert_int_eq(2048, out3);
+    cester_assert_uint_eq(0x00800000, pushed);
+)
+
+// NCCT: normal color color triple
+CESTER_TEST(ncct_basic, gte_tests,
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    cop2_put(6, 0x00808080);
+    cop2_put(5, 0x1000);
+    cop2_put(4, 0x00000000);
+    cop2_put(3, 0x1000);
+    cop2_put(2, 0x00000000);
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCCT(1, 1));
+    uint32_t first, second, third;
+    cop2_get(22, third);
+    cop2_get(21, second);
+    cop2_get(20, first);
+    ramsyscall_printf("NCCT: RGB0=0x%08x RGB1=0x%08x RGB2=0x%08x\n", first, second, third);
+    // V0, V1 and V2 are the same normal, so all three FIFO entries must match
+    cester_assert_uint_eq(0x00800000, first);
+    cester_assert_uint_eq(0x00800000, second);
+    cester_assert_uint_eq(0x00800000, third);
+)
+
+// NCDS: normal color depth single (full 3-stage pipeline + depth cue)
+CESTER_TEST(ncds_no_depth, gte_tests,
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    gte_set_far_color(0, 0, 0);
+    cop2_put(8, 0);  // IR0 = 0 (no depth cue)
+    cop2_put(6, 0x00808080);
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCDS(1, 1));
+    uint32_t pushed;
+    int32_t out1, out2, out3;
+    cop2_get(22, pushed);
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    ramsyscall_printf("NCDS no depth: MAC=(%d,%d,%d) RGB2=0x%08x\n", out1, out2, out3, pushed);
+    cester_assert_int_eq(0, out1);
+    cester_assert_int_eq(0, out2);
+    cester_assert_int_eq(2048, out3);
+    cester_assert_uint_eq(0x00800000, pushed);
+)
+
+// NCDS with depth cue
+CESTER_TEST(ncds_with_depth, gte_tests,
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    gte_set_far_color(0x1000, 0x1000, 0x1000);
+    cop2_put(8, 0x0800);  // IR0 = 0.5
+    cop2_put(6, 0x00808080);
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCDS(1, 1));
+    int32_t out1, out2, out3;
+    uint32_t pushed;
+    cop2_get(22, pushed);
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    uint32_t status = gte_read_flag();
+    ramsyscall_printf("NCDS depth: MAC=(%d,%d,%d) RGB2=0x%08x FLAG=0x%08x\n",
+                      out1, out2, out3, pushed, status);
+    cester_assert_int_eq(2048, out1);
+    cester_assert_int_eq(2048, out2);
+    cester_assert_int_eq(3072, out3);
+    cester_assert_uint_eq(0x00c08080, pushed);
+    cester_assert_uint_eq(0x00000000, status);
+)
+
+// NCDT: normal color depth triple
+CESTER_TEST(ncdt_basic, gte_tests,
+    gte_set_simple_light();
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    gte_set_far_color(0, 0, 0);
+    cop2_put(8, 0);
+    cop2_put(6, 0x00808080);
+    cop2_put(5, 0x0400);
+    cop2_put(4, 0x00000000);
+    cop2_put(3, 0x0800);
+    cop2_put(2, 0x00000000);
+    cop2_put(1, 0x1000);
+    cop2_put(0, 0x00000000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCDT(1, 1));
+    uint32_t first, second, third;
+    cop2_get(22, third);
+    cop2_get(21, second);
+    cop2_get(20, first);
+    ramsyscall_printf("NCDT: RGB0=0x%08x RGB1=0x%08x RGB2=0x%08x\n", first, second, third);
+    // Light falls off with the normal's Z: V0 = 0x1000 is strongest, V2 = 0x400 weakest
+    cester_assert_uint_eq(0x00800000, first);
+    cester_assert_uint_eq(0x00400000, second);
+    cester_assert_uint_eq(0x00200000, third);
+)
+
+// CC: color color (light-to-color + vertex color multiply)
+CESTER_TEST(cc_basic, gte_tests,
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    // IR1..IR3 carry a pre-computed light intensity
+    cop2_put(6, 0x00808080);  // RGBC
+    cop2_put(11, 0x0400);
+    cop2_put(10, 0x0800);
+    cop2_put(9, 0x1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_CC(1, 1));
+    uint32_t pushed;
+    int32_t out1, out2, out3;
+    cop2_get(22, pushed);
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    ramsyscall_printf("CC: MAC=(%d,%d,%d) RGB2=0x%08x\n", out1, out2, out3, pushed);
+    // Stage 1 (light to color): LC is identity and BK is zero, so MAC = IR
+    // Stage 2 (color multiply): MAC1 = (R << 4) * IR1 = 0x800 * 0x1000 = 0x800000
+    //   >> 12 = 0x800 = 2048, and 2048 / 16 = 128 for the color byte
+    //   MAC2 and MAC3 follow the same steps from IR2 and IR3
+    cester_assert_int_eq(2048, out1);
+    cester_assert_int_eq(1024, out2);
+    cester_assert_int_eq(512, out3);
+    cester_assert_uint_eq(0x00204080, pushed);
+)
+
+// CDP: color depth cue with pre-computed light
+CESTER_TEST(cdp_basic, gte_tests,
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    gte_set_far_color(0x1000, 0x1000, 0x1000);
+    cop2_put(8, 0);  // IR0=0: no depth cue
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 0x1000);
+    cop2_put(10, 0x1000);
+    cop2_put(9, 0x1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_CDP(1, 1));
+    uint32_t pushed;
+    int32_t out1, out2, out3;
+    cop2_get(22, pushed);
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    ramsyscall_printf("CDP: MAC=(%d,%d,%d) RGB2=0x%08x\n", out1, out2, out3, pushed);
+    cester_assert_int_eq(2048, out1);
+    cester_assert_int_eq(2048, out2);
+    cester_assert_int_eq(2048, out3);
+    cester_assert_uint_eq(0x00808080, pushed);
+)
+
+// CDP with depth cue
+CESTER_TEST(cdp_with_depth, gte_tests,
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    gte_set_far_color(0x1000, 0x1000, 0x1000);
+    cop2_put(8, 0x0800);  // IR0=0.5
+    cop2_put(6, 0x00808080);
+    cop2_put(11, 0x1000);
+    cop2_put(10, 0x1000);
+    cop2_put(9, 0x1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_CDP(1, 1));
+    int32_t out1, out2, out3;
+    uint32_t pushed;
+    cop2_get(22, pushed);
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    uint32_t status = gte_read_flag();
+    ramsyscall_printf("CDP depth: MAC=(%d,%d,%d) RGB2=0x%08x FLAG=0x%08x\n",
+                      out1, out2, out3, pushed, status);
+    cester_assert_int_eq(3072, out1);
+    cester_assert_int_eq(3072, out2);
+    cester_assert_int_eq(3072, out3);
+    cester_assert_uint_eq(0x00c0c0c0, pushed);
+    cester_assert_uint_eq(0x00000000, status);
+)
+
+// Full lighting pipeline: light matrix with non-trivial light direction
+CESTER_TEST(ncs_full_light_matrix, gte_tests,
+    // Light matrix with 0.707 on L11 and L33 and every other entry zero
+    // (0.707 in 4.12 fixed point ~ 0x0B50)
+    cop2_putc(12, 0x0b50);      // L33=0x0B50
+    cop2_putc(11, 0x00000000);  // L31=0, L32=0
+    cop2_putc(10, 0x00000000);  // L22=0, L23=0
+    cop2_putc(9, 0x00000000);   // L13=0, L21=0
+    cop2_putc(8, 0x00000b50);   // L11=0x0B50, L12=0
+    gte_set_white_light_color();
+    gte_set_zero_bk();
+    // V0 is the normal (0x1000, 0, 0), facing X
+    cop2_put(6, 0x00000000);
+    cop2_put(1, 0);
+    cop2_put(0, (0 << 16) | 0x1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_NCS(1, 1));
+    int32_t out1, out2, out3;
+    cop2_get(27, out3);
+    cop2_get(26, out2);
+    cop2_get(25, out1);
+    ramsyscall_printf("NCS 45deg: MAC=(%d,%d,%d)\n", out1, out2, out3);
+    // Stage 1: L * normal = (L11*VX, 0, L31*VX) = (0x0B50 * 0x1000, 0, 0)
+    //   >> 12 gives IR = (0x0B50, 0, 0)
+    // Stage 2: LC is identity and BK is 0, so LC * IR = (0x0B50, 0, 0)
+    // Expected: MAC1 = 0x0B50, MAC2 = 0, MAC3 = 0
+    cester_assert_int_eq(0x0b50, out1);
+    cester_assert_int_eq(0, out2);
+    cester_assert_int_eq(0, out3);
+)
diff --git a/src/mips/tests/gte/gte-mvmva.c b/src/mips/tests/gte/gte-mvmva.c
new file mode 100644
index 000000000..a7714667a
--- /dev/null
+++ b/src/mips/tests/gte/gte-mvmva.c
@@ -0,0 +1,203 @@
+// MVMVA: parameterized matrix-vector multiply and add
+
+// mx=RT, v=V0, cv=TR (standard transform)
+CESTER_TEST(mvmva_rt_v0_tr, gte_tests,
+    // 90-degree Z rotation
+    cop2_putc(0, 0xf0000000);  // R11=0, R12=-0x1000
+    cop2_putc(1, 0x10000000);  // R13=0, R21=0x1000
+    cop2_putc(2, 0x00000000);  // R22=0, R23=0
+    cop2_putc(3, 0x00000000);  // R31=0, R32=0
+    cop2_putc(4, 0x1000);      // R33=0x1000
+    gte_set_translation(10, 20, 30);
+    cop2_put(0, (200 << 16) | 100);  // V0 = (100, 200)
+    cop2_put(1, 300);                // V0.Z = 300
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 0, 0, 0, 0));  // sf=1, mx=RT, v=V0, cv=TR, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    cester_assert_int_eq(-190, mac1);  // 10 + R12*VY = 10 - 200
+    cester_assert_int_eq(120, mac2);   // 20 + R21*VX = 20 + 100
+    cester_assert_int_eq(330, mac3);   // 30 + R33*VZ = 30 + 300
+)
+
+// mx=RT, v=V1, cv=Zero
+CESTER_TEST(mvmva_rt_v1_zero, gte_tests,
+    gte_set_identity_rotation();
+    cop2_put(2, (40 << 16) | 30);  // V1 = (30, 40)
+    cop2_put(3, 50);                // V1.Z = 50
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 0, 1, 3, 0));  // sf=1, mx=RT, v=V1, cv=Zero, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    cester_assert_int_eq(30, mac1);  // identity rotation, no offset: MAC = V1
+    cester_assert_int_eq(40, mac2);
+    cester_assert_int_eq(50, mac3);
+)
+
+// mx=RT, v=V2, cv=BK
+CESTER_TEST(mvmva_rt_v2_bk, gte_tests,
+    gte_set_identity_rotation();
+    cop2_putc(13, 1000);  // RBK
+    cop2_putc(14, 2000);  // GBK
+    cop2_putc(15, 3000);  // BBK
+    cop2_put(4, (200 << 16) | 100);  // V2 = (100, 200)
+    cop2_put(5, 300);                // V2.Z = 300
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 0, 2, 1, 0));  // sf=1, mx=RT, v=V2, cv=BK, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    cester_assert_int_eq(1100, mac1);  // RBK + VX = 1000 + 100
+    cester_assert_int_eq(2200, mac2);  // GBK + VY = 2000 + 200
+    cester_assert_int_eq(3300, mac3);  // BBK + VZ = 3000 + 300
+)
+
+// mx=RT, v=IR, cv=Zero
+CESTER_TEST(mvmva_rt_ir_zero, gte_tests,
+    gte_set_identity_rotation();
+    cop2_put(9, 500);   // IR1
+    cop2_put(10, 600);  // IR2
+    cop2_put(11, 700);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 0, 3, 3, 0));  // sf=1, mx=RT, v=IR, cv=Zero, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    cester_assert_int_eq(500, mac1);  // identity rotation, no offset: MAC = IR
+    cester_assert_int_eq(600, mac2);
+    cester_assert_int_eq(700, mac3);
+)
+
+// mx=LL (light matrix), v=V0, cv=Zero
+CESTER_TEST(mvmva_ll_v0_zero, gte_tests,
+    gte_set_simple_light();  // L33=0x1000, rest zero
+    cop2_put(0, (200 << 16) | 100);  // V0 = (100, 200); multiplied by zero entries only
+    cop2_put(1, 0x1000);             // V0.Z = 0x1000
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 1, 0, 3, 0));  // sf=1, mx=LL, v=V0, cv=Zero, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // Only L33 is non-zero, so MAC3 = L33*VZ0 >> 12 = 0x1000 * 0x1000 >> 12 = 0x1000
+    cester_assert_int_eq(0, mac1);
+    cester_assert_int_eq(0, mac2);
+    cester_assert_int_eq(0x1000, mac3);
+)
+
+// mx=LC (light color), v=IR, cv=BK
+CESTER_TEST(mvmva_lc_ir_bk, gte_tests,
+    gte_set_white_light_color();
+    cop2_putc(13, 100);  // RBK
+    cop2_putc(14, 200);  // GBK
+    cop2_putc(15, 300);  // BBK
+    cop2_put(9, 0x1000);   // IR1
+    cop2_put(10, 0x1000);  // IR2
+    cop2_put(11, 0x1000);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 2, 3, 1, 0));  // sf=1, mx=LC, v=IR, cv=BK, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // White LC identity: MAC = (BK<<12 + LR1*IR1) >> 12 = BK + IR
+    // BK = (100, 200, 300), IR = (0x1000, 0x1000, 0x1000) = (4096, 4096, 4096)
+    // MAC1 = 100 + 4096 = 4196, etc.
+    cester_assert_int_eq(4196, mac1);
+    cester_assert_int_eq(4296, mac2);
+    cester_assert_int_eq(4396, mac3);
+)
+
+// cv=2 (far color) bug
+CESTER_TEST(mvmva_cv2_fc_bug, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_far_color(0x1000, 0x2000, 0x3000);
+    cop2_put(0, (0x200 << 16) | 0x100);  // V0 = (0x100, 0x200)
+    cop2_put(1, 0x300);                  // V0.Z = 0x300
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 0, 0, 2, 0));  // sf=1, mx=RT, v=V0, cv=FC, lm=0
+    int32_t mac1, mac2, mac3;
+    uint32_t flag;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    flag = gte_read_flag();
+    // Buggy: the result is reduced to the last portion of the formula, i.e.
+    // the last two matrix columns (per psx-spx); FC is not added at all.
+    // MAC1 = (R12*VY + R13*VZ) >> 12 = 0
+    // MAC2 = (R22*VY + R23*VZ) >> 12 = VY = 0x200 (identity: R22=0x1000)
+    // MAC3 = (R32*VY + R33*VZ) >> 12 = VZ = 0x300 (identity: R33=0x1000)
+    ramsyscall_printf("MVMVA cv=2: MAC=(%d,%d,%d) FLAG=0x%08x\n", mac1, mac2, mac3, flag);
+    cester_assert_int_eq(0, mac1);
+    cester_assert_int_eq(512, mac2);
+    cester_assert_int_eq(768, mac3);
+    cester_assert_uint_eq(0x00000000, flag);
+)
+
+// mx=3 (garbage matrix)
+CESTER_TEST(mvmva_mx3_garbage, gte_tests,
+    cop2_putc(0, 0x20001000);  // R11=0x1000, R12=0x2000
+    cop2_putc(1, 0x40003000);  // R13=0x3000, R21=0x4000
+    cop2_putc(2, 0x60005000);  // R22=0x5000, R23=0x6000
+    cop2_putc(3, 0x80007000);  // R31=0x7000, R32=-0x8000
+    cop2_putc(4, 0x1000);      // R33=0x1000
+    cop2_put(8, 0x0800);  // IR0
+    cop2_put(0, (0x100 << 16) | 0x100);  // V0 = (0x100, 0x100)
+    cop2_put(1, 0x100);                  // V0.Z = 0x100
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(1, 3, 0, 3, 0));  // sf=1, mx=3, v=V0, cv=Zero, lm=0
+    int32_t mac1, mac2, mac3;
+    uint32_t flag;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    flag = gte_read_flag();
+    ramsyscall_printf("MVMVA mx=3: MAC=(%d,%d,%d) FLAG=0x%08x\n", mac1, mac2, mac3, flag);
+    cester_assert_int_eq(128, mac1);   // equals IR0*VZ >> 12 = 0x800*0x100 >> 12
+    cester_assert_int_eq(2304, mac2);  // equals R13*(VX+VY+VZ) >> 12 = 0x3000*0x300 >> 12
+    cester_assert_int_eq(3840, mac3);  // equals R22*(VX+VY+VZ) >> 12 = 0x5000*0x300 >> 12
+    cester_assert_uint_eq(0x00000000, flag);
+)
+
+// MVMVA with lm=1
+CESTER_TEST(mvmva_lm1, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(-500, -600, -700);
+    cop2_put(0, (100 << 16) | 100);  // V0 = (100, 100)
+    cop2_put(1, 100);                // V0.Z = 100
+    gte_clear_flag();
+    // sf=1, mx=RT, v=V0, cv=TR, lm=1
+    cop2_cmd(COP2_MVMVA(1, 0, 0, 0, 1));
+    int32_t mac1;
+    uint32_t ir1;
+    cop2_get(25, mac1);
+    cop2_get(9, ir1);
+    // MAC1 = 100 + (-500) = -400 (MAC itself is not clamped by lm)
+    cester_assert_int_eq(-400, mac1);
+    // IR1 with lm=1: clamped to [0, 0x7fff], so -400 -> 0
+    cester_assert_uint_eq(0, ir1);
+)
+
+// MVMVA sf=0 (no shift)
+CESTER_TEST(mvmva_sf0, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_put(0, (10 << 16) | 10);  // V0 = (10, 10)
+    cop2_put(1, 10);               // V0.Z = 10
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(0, 0, 0, 3, 0));  // sf=0, mx=RT, v=V0, cv=Zero, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // sf=0: no >>12 shift. MAC = R * V = 0x1000 * 10 = 40960
+    cester_assert_int_eq(40960, mac1);
+    cester_assert_int_eq(40960, mac2);
+    cester_assert_int_eq(40960, mac3);
+)
diff --git a/src/mips/tests/gte/gte-nclip.c b/src/mips/tests/gte/gte-nclip.c
new file mode 100644
index 000000000..b6f6e4f7b
--- /dev/null
+++ b/src/mips/tests/gte/gte-nclip.c
@@ -0,0 +1,84 @@
+// NCLIP: normal clipping (screen-space triangle winding / area)
+// MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1)
+
+CESTER_TEST(nclip_ccw, gte_tests,
+    cop2_put(12, 0x00000000);  // SXY0 = (0,0)
+    cop2_put(13, 0x00000064);  // SXY1 = (100,0)
+    cop2_put(14, 0x00640000);  // SXY2 = (0,100)
+    gte_clear_flag();
+    cop2_cmd(COP2_NCLIP);
+    int32_t mac0;
+    cop2_get(24, mac0);
+    cester_assert_int_eq(10000, mac0);  // SX1*(SY2-SY0) = 100*100, other terms are 0
+    cester_assert_uint_eq(0, gte_read_flag());
+)
+
+CESTER_TEST(nclip_cw, gte_tests,
+    cop2_put(12, 0x00000000);  // SXY0 = (0,0)
+    cop2_put(13, 0x00640000);  // SXY1 = (0,100)
+    cop2_put(14, 0x00000064);  // SXY2 = (100,0)
+    gte_clear_flag();
+    cop2_cmd(COP2_NCLIP);
+    int32_t mac0;
+    cop2_get(24, mac0);
+    cester_assert_int_eq(-10000, mac0);  // SX2*(SY0-SY1) = 100*(0-100), other terms are 0
+)
+
+CESTER_TEST(nclip_collinear, gte_tests,
+    cop2_put(12, 0x00000000);  // SXY0 = (0,0)
+    cop2_put(13, 0x00320032);  // SXY1 = (50,50)
+    cop2_put(14, 0x00640064);  // SXY2 = (100,100)
+    gte_clear_flag();
+    cop2_cmd(COP2_NCLIP);
+    int32_t mac0;
+    cop2_get(24, mac0);
+    cester_assert_int_eq(0, mac0);  // 50*(100-0) + 100*(0-50) = 0
+)
+
+// NCLIP with large screen coords near saturation limits
+CESTER_TEST(nclip_large_coords, gte_tests,
+    // SXY near the limits (-0x400..0x3FF); 'u' avoids signed-shift overflow
+    cop2_put(12, (0xfc00u << 16) | 0x03ff);  // (0x3FF, -0x400)
+    cop2_put(13, (0x03ff << 16) | 0xfc00);  // (-0x400, 0x3FF)
+    cop2_put(14, 0x00000000);                // (0, 0)
+    gte_clear_flag();
+    cop2_cmd(COP2_NCLIP);
+    int32_t mac0;
+    uint32_t flag;
+    cop2_get(24, mac0);
+    flag = gte_read_flag();
+    // SX0=0x3FF, SY0=-0x400, SX1=-0x400, SY1=0x3FF, SX2=0, SY2=0
+    // MAC0 = SX0*(SY1-SY2) + SX1*(SY2-SY0) + SX2*(SY0-SY1)
+    //      = 0x3FF*(0x3FF-0) + (-0x400)*(0-(-0x400)) + 0*((-0x400)-0x3FF)
+    //      = 0x3FF*0x3FF + (-0x400)*0x400
+    //      = 1046529 - 1048576 = -2047
+    // Same value from the expanded form:
+    //   SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1
+    //   = 0x3FF*0x3FF - 0x400*0x400 = -2047
+    ramsyscall_printf("NCLIP large: MAC0=%d FLAG=0x%08x\n", mac0, flag);
+    cester_assert_int_eq(-2047, mac0);
+    cester_assert_uint_eq(0, flag);
+)
+
+// NCLIP MAC0 overflow: maximum possible cross product
+CESTER_TEST(nclip_overflow, gte_tests,
+    // Use values whose cross product falls outside the signed 32-bit range
+    // (here it underflows); 0x8000u keeps the <<16 out of signed overflow
+    // Saturated SX/SY (-0x400..0x3FF, 11-bit signed) only reach ~4 million,
+    // so write unsaturated values: SXY registers are 16-bit signed
+    cop2_put(12, (0x7fff << 16) | 0x7fff);  // (32767, 32767)
+    cop2_put(13, (0x8000u << 16) | 0x8000);  // (-32768, -32768)
+    cop2_put(14, (0x7fff << 16) | 0x8000);  // (-32768, 32767)
+    gte_clear_flag();
+    cop2_cmd(COP2_NCLIP);
+    int32_t mac0;
+    uint32_t flag;
+    cop2_get(24, mac0);
+    flag = gte_read_flag();
+    ramsyscall_printf("NCLIP overflow: MAC0=%d FLAG=0x%08x\n", mac0, flag);
+    // Check if FLAG.16 or FLAG.15 (MAC0 overflow) is set
+    ramsyscall_printf("  FLAG.16=%u FLAG.15=%u\n", (flag >> 16) & 1, (flag >> 15) & 1);
+    cester_assert_int_eq(131071, mac0);  // -65535*65535 truncated to 32 bits
+    uint32_t f15 = (flag >> 15) & 1;
+    cester_assert_uint_eq(1, f15);  // true result is negative, so FLAG.15 is the one expected
+)
diff --git a/src/mips/tests/gte/gte-op.c b/src/mips/tests/gte/gte-op.c
new file mode 100644
index 000000000..d861b0f34
--- /dev/null
+++ b/src/mips/tests/gte/gte-op.c
@@ -0,0 +1,82 @@
+// OP: outer product / cross product
+// Uses rotation matrix diagonal (R11, R22, R33) as D vector
+// Result = D x IR
+
+CESTER_TEST(op_identity_diagonal, gte_tests,
+    gte_set_identity_rotation();
+    cop2_put(9, 1000);   // IR1
+    cop2_put(10, 2000);  // IR2
+    cop2_put(11, 3000);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_OP_CP(1, 0));  // sf=1
+    int32_t ir1, ir2, ir3;
+    cop2_get(9, ir1);
+    cop2_get(10, ir2);
+    cop2_get(11, ir3);
+    // D=(1,1,1) in 4.12 fixed point (0x1000 each), IR=(1000,2000,3000)
+    // cross = (1*3000-1*2000, 1*1000-1*3000, 1*2000-1*1000) = (1000,-2000,1000)
+    cester_assert_int_eq(1000, ir1);
+    cester_assert_int_eq(-2000, ir2);
+    cester_assert_int_eq(1000, ir3);
+)
+
+CESTER_TEST(op_unshifted, gte_tests,
+    gte_set_identity_rotation();
+    cop2_put(9, 10);   // IR1
+    cop2_put(10, 20);  // IR2
+    cop2_put(11, 30);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_OP_CP(0, 0));  // sf=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // sf=0: no shift. D=(0x1000,0x1000,0x1000), IR=(10,20,30)
+    // MAC1 = R22*IR3 - R33*IR2 = 0x1000*30 - 0x1000*20 = 4096*(30-20) = 40960
+    cester_assert_int_eq(40960, mac1);
+    cester_assert_int_eq(-81920, mac2);  // R33*IR1 - R11*IR3 = 4096*(10-30)
+    cester_assert_int_eq(40960, mac3);   // R11*IR2 - R22*IR1 = 4096*(20-10)
+)
+
+// OP with asymmetric diagonal
+CESTER_TEST(op_asymmetric, gte_tests,
+    cop2_putc(0, 0x00000800);  // R11=0x800 (0.5)
+    cop2_putc(1, 0x00000000);  // R13=0, R21=0
+    cop2_putc(2, 0x00001000);  // R22=0x1000 (1.0)
+    cop2_putc(3, 0x00000000);  // R31=0, R32=0
+    cop2_putc(4, 0x2000);      // R33=0x2000 (2.0)
+    cop2_put(9, 100);   // IR1
+    cop2_put(10, 200);  // IR2
+    cop2_put(11, 300);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_OP_CP(1, 0));  // sf=1
+    int32_t ir1, ir2, ir3;
+    cop2_get(9, ir1);
+    cop2_get(10, ir2);
+    cop2_get(11, ir3);
+    // D=(0.5, 1.0, 2.0), IR=(100,200,300)
+    // cross.x = D.y*IR.z - D.z*IR.y = 1.0*300 - 2.0*200 = 300 - 400 = -100
+    // cross.y = D.z*IR.x - D.x*IR.z = 2.0*100 - 0.5*300 = 200 - 150 = 50
+    // cross.z = D.x*IR.y - D.y*IR.x = 0.5*200 - 1.0*100 = 100 - 100 = 0
+    cester_assert_int_eq(-100, ir1);
+    cester_assert_int_eq(50, ir2);
+    cester_assert_int_eq(0, ir3);
+)
+
+// OP with maximal equal operands: the products cancel, so FLAG must stay clear
+CESTER_TEST(op_overflow_flag, gte_tests,
+    cop2_putc(0, 0x00007fff);  // R11=0x7fff
+    cop2_putc(2, 0x00007fff);  // R22=0x7fff
+    cop2_putc(4, 0x7fff);      // R33=0x7fff
+    cop2_put(9, 0x7fff);   // IR1
+    cop2_put(10, 0x7fff);  // IR2
+    cop2_put(11, 0x7fff);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_OP_CP(0, 0));  // sf=0, no shift -> large products
+    uint32_t flag;
+    flag = gte_read_flag();
+    ramsyscall_printf("OP overflow: FLAG=0x%08x\n", flag);
+    // With sf=0: MAC = 0x7fff*0x7fff - 0x7fff*0x7fff = 0 for all
+    // The cross product is zero since all components are equal: no overflow
+    cester_assert_uint_eq(0x00000000, flag);
+)
diff --git a/src/mips/tests/gte/gte-precision.c b/src/mips/tests/gte/gte-precision.c
new file mode 100644
index 000000000..c5001f8fe
--- /dev/null
+++ b/src/mips/tests/gte/gte-precision.c
@@ -0,0 +1,364 @@
+// Precision tests: 44-bit MAC overflow detection, division table
+// spot-checks, RTPS IR3/FLAG.22 sf=0 anomaly.
+// These target the exact behaviors that cause subtle game glitches
+// when emulated imprecisely.
+
+// ==========================================================================
+// 44-bit MAC overflow detection (FLAG bits 25-30)
+// ==========================================================================
+// The GTE accumulator is 44 bits wide. Overflow is detected per-addition
+// in the chain, not on the final result. Two overflows that cancel out
+// will still both be flagged.
+
+// MAC1 positive overflow (FLAG.30): sum exceeds +0x7FFFFFFFFFF
+CESTER_TEST(prec_mac1_positive_overflow, gte_tests,
+    // MVMVA with large matrix and large vector, sf=0 (no shift)
+    // R11=0x7FFF, V0.X=0x7FFF -> R11*VX = 0x3FFF0001
+    // With TR=0x7FFFFFFF and sf=0: TRX<<12 + R11*VX + R12*VY + R13*VZ
+    // TRX<<12 = 0x7FFFFFFF000 (43 bits); + 0x3FFF0001 = 0x8003FFEF001 > 44-bit max
+    cop2_putc(0, 0x00007fff);  // R11=0x7FFF, R12=0
+    cop2_putc(1, 0x00000000);
+    cop2_putc(2, 0x00000000);
+    cop2_putc(3, 0x00000000);
+    cop2_putc(4, 0);
+    cop2_putc(5, 0x7fffffff);  // TRX = max positive 32-bit
+    cop2_putc(6, 0);
+    cop2_putc(7, 0);
+    cop2_put(0, (0 << 16) | 0x7fff);  // VX=0x7FFF, VY=0
+    cop2_put(1, 0);
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(0, COP2_MX_RT, COP2_V_V0, COP2_CV_TR, 0));
+    uint32_t flag = gte_read_flag();
+    uint32_t f30 = (flag >> 30) & 1;
+    ramsyscall_printf("MAC1 pos overflow: FLAG=0x%08x F30=%u\n", flag, f30);
+    cester_assert_uint_eq(1, f30);
+)
+
+// MAC1 negative overflow (FLAG.27)
+CESTER_TEST(prec_mac1_negative_overflow, gte_tests,
+    cop2_putc(0, 0x00007fff);  // R11=0x7FFF
+    cop2_putc(1, 0x00000000);
+    cop2_putc(2, 0x00000000);
+    cop2_putc(3, 0x00000000);
+    cop2_putc(4, 0);
+    cop2_putc(5, 0x80000000);  // TRX = min negative 32-bit; TRX<<12 is the 44-bit minimum
+    cop2_putc(6, 0);
+    cop2_putc(7, 0);
+    cop2_put(0, (0 << 16) | 0x8000);  // VX=-0x8000 (negative), so R11*VX < 0
+    cop2_put(1, 0);
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(0, COP2_MX_RT, COP2_V_V0, COP2_CV_TR, 0));
+    uint32_t flag = gte_read_flag();
+    uint32_t f27 = (flag >> 27) & 1;
+    ramsyscall_printf("MAC1 neg overflow: FLAG=0x%08x F27=%u\n", flag, f27);
+    cester_assert_uint_eq(1, f27);
+)
+
+// MAC2 overflow (FLAG.29 positive, FLAG.26 negative)
+CESTER_TEST(prec_mac2_overflow, gte_tests,
+    cop2_putc(0, 0x00000000);
+    cop2_putc(1, 0x7fff0000);  // R21=0x7FFF (high16 of R13R21), R13=0
+    cop2_putc(2, 0x00000000);
+    cop2_putc(3, 0x00000000);
+    cop2_putc(4, 0);
+    cop2_putc(5, 0);
+    cop2_putc(6, 0x7fffffff);  // TRY = max
+    cop2_putc(7, 0);
+    cop2_put(0, (0 << 16) | 0x7fff);  // VX=0x7FFF, VY=0
+    cop2_put(1, 0);
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(0, COP2_MX_RT, COP2_V_V0, COP2_CV_TR, 0));
+    uint32_t flag = gte_read_flag();
+    uint32_t f29 = (flag >> 29) & 1;
+    ramsyscall_printf("MAC2 pos overflow: FLAG=0x%08x F29=%u\n", flag, f29);
+    cester_assert_uint_eq(1, f29);  // only the positive direction is exercised here
+)
+
+// MAC3 overflow (FLAG.28 positive, FLAG.25 negative)
+CESTER_TEST(prec_mac3_overflow, gte_tests,
+    cop2_putc(0, 0x00000000);
+    cop2_putc(1, 0x00000000);
+    cop2_putc(2, 0x00000000);
+    cop2_putc(3, 0x00007fff);  // R31=0x7FFF (low16 of R31R32), R32=0
+    cop2_putc(4, 0);
+    cop2_putc(5, 0);
+    cop2_putc(6, 0);
+    cop2_putc(7, 0x7fffffff);  // TRZ = max
+    cop2_put(0, (0 << 16) | 0x7fff);  // VX=0x7FFF, VY=0
+    cop2_put(1, 0);
+    gte_clear_flag();
+    cop2_cmd(COP2_MVMVA(0, COP2_MX_RT, COP2_V_V0, COP2_CV_TR, 0));
+    uint32_t flag = gte_read_flag();
+    uint32_t f28 = (flag >> 28) & 1;
+    ramsyscall_printf("MAC3 pos overflow: FLAG=0x%08x F28=%u\n", flag, f28);
+    cester_assert_uint_eq(1, f28);  // only the positive direction is exercised here
+)
+
+// Cancelling products: two equal 16x16 products subtract to zero without
+// ever leaving the 44-bit range, so no overflow flag may be raised
+CESTER_TEST(prec_mac_double_overflow, gte_tests,
+    // Use OP (cross product) sf=0 with the largest positive 16-bit
+    // operands on both sides of the subtract
+    // MAC1 = R22*IR3 - R33*IR2
+    // Each product is only 0x3FFF0001, far below the 44-bit limit
+    cop2_putc(0, 0x00000000);
+    cop2_putc(2, 0x00007fff);  // R22=0x7FFF
+    cop2_putc(4, 0x7fff);      // R33=0x7FFF
+    cop2_put(9, 0);
+    cop2_put(10, 0x7fff);  // IR2
+    cop2_put(11, 0x7fff);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_OP_CP(0, 0));  // sf=0
+    int32_t mac1;
+    uint32_t flag;
+    cop2_get(25, mac1);
+    flag = gte_read_flag();
+    ramsyscall_printf("double overflow: MAC1=%d FLAG=0x%08x\n", mac1, flag);
+    // R22*IR3 = 0x7FFF*0x7FFF = 0x3FFF0001 (fits in 44-bit)
+    // Then subtract R33*IR2 = 0x7FFF*0x7FFF = 0x3FFF0001
+    // Result = 0, and no intermediate step overflows, so FLAG stays 0
+    cester_assert_int_eq(0, mac1);
+    cester_assert_uint_eq(0, flag);
+)
+
+// ==========================================================================
+// Division table spot-checks
+// ==========================================================================
+// The UNR table has 257 entries. Test specific H/SZ3 pairs that exercise
+// known table entries and verify exact quotients.
+
+// Approach: run RTPS with a chosen H and SZ3 (SZ3 is set through VZ), then
+// observe the quotient through SX: with OFX=0, SX = IR1 * quotient >> 16.
+// The tests below read SX from the low half of SXY2 and check FLAG.17
+// (division overflow); no separate helper is used.
+
+// H/SZ3 = 1/1: quotient should be near 0x10000 (1.0 in 0.16 fixed)
+CESTER_TEST(prec_div_1_over_1, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);  // OFX=0
+    cop2_putc(25, 0);
+    cop2_putc(26, 1);  // H=1
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, (0 << 16) | 0x1000);  // VX=0x1000, VY=0
+    cop2_put(1, 1);  // VZ=1 -> SZ3=1
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2, flag;
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    int16_t sx = (int16_t)(sxy2 & 0xffff);
+    ramsyscall_printf("div 1/1: SX=%d FLAG=0x%08x\n", sx, flag);
+    // H=1, SZ3=1 -> H >= SZ3*2? 1 >= 2? No -> no overflow
+    // quotient ~ ((H*0x20000/SZ3)+1)/2 = 0x10000 (per psx-spx), below 0x1FFFF
+    // SX = IR1 * quotient >> 16 = 0x1000 * 0x10000 >> 16 = 0x1000
+    // Then saturated to 0x3FF (SX is only printed, not asserted, here)
+    uint32_t f17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(0, f17);  // no division overflow
+)
+
+// H/SZ3 = 100/1000: quotient = 0.1 in fixed point
+CESTER_TEST(prec_div_100_over_1000, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);  // OFX=0
+    cop2_putc(25, 0);
+    cop2_putc(26, 100);  // H=100
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, (0 << 16) | 1000);  // VX=1000
+    cop2_put(1, 1000);              // VZ=1000
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2;
+    cop2_get(14, sxy2);
+    int16_t sx = (int16_t)(sxy2 & 0xffff);  // SX is the low half of SXY2
+    ramsyscall_printf("div 100/1000: SX=%d\n", sx);
+    // SX = 1000 * (100/1000) = 100 (roughly, depends on table rounding)
+    cester_assert_int_eq(100, sx);
+)
+
+// The documented corner case: H=0xF015, SZ3=0x780B -> 0x20000 saturates to 0x1FFFF
+CESTER_TEST(prec_div_corner_f015_780b, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);  // OFX=0
+    cop2_putc(25, 0);
+    cop2_putc(26, 0xf015);  // H
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, (0 << 16) | 1);  // VX=1 (minimal to see quotient effect)
+    cop2_put(1, 0x780b);         // VZ = 0x780B
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2, flag;
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    int16_t sx = (int16_t)(sxy2 & 0xffff);
+    ramsyscall_printf("div F015/780B: SX=%d FLAG=0x%08x\n", sx, flag);
+    // This should NOT set FLAG.17 (division overflow)
+    uint32_t f17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(0, f17);  // H=0xF015 < SZ3*2=0xF016
+)
+
+// Large H exactly at the overflow threshold: H=0xFFFE, SZ3=0x7FFF (H == SZ3*2)
+CESTER_TEST(prec_div_large_h, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);  // OFX=0
+    cop2_putc(25, 0);
+    cop2_putc(26, 0xfffe);  // H near max
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, (0 << 16) | 1);
+    cop2_put(1, 0x7fff);  // SZ3=0x7FFF -> H >= SZ3*2? 0xFFFE >= 0xFFFE -> yes, overflow
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t flag;
+    flag = gte_read_flag();
+    uint32_t f17 = (flag >> 17) & 1;
+    ramsyscall_printf("div large H: FLAG=0x%08x F17=%u\n", flag, f17);
+    cester_assert_uint_eq(1, f17);  // H >= SZ3*2 is true (equal counts)
+)
+
+// SZ3=1 with H=1 and a minimal VX: SX must come out as exactly 1
+CESTER_TEST(prec_div_sz3_one, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);  // OFX=0
+    cop2_putc(25, 0);
+    cop2_putc(26, 1);  // H=1
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    cop2_put(0, (0 << 16) | 1);  // VX=1, VY=0
+    cop2_put(1, 1);  // SZ3=1
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));
+    uint32_t sxy2, flag;
+    int32_t ir1;
+    cop2_get(14, sxy2);
+    cop2_get(9, ir1);
+    flag = gte_read_flag();
+    int16_t sx = (int16_t)(sxy2 & 0xffff);
+    ramsyscall_printf("div SZ3=1: SX=%d IR1=%d FLAG=0x%08x\n", sx, ir1, flag);
+    // H/SZ3 = 1/1 -> quotient ~ 0x10000 per the psx-spx formula (1.0)
+    // SX = IR1 * quotient >> 16 = 1 * 0x10000 >> 16 = 1
+    cester_assert_int_eq(1, sx);
+)
+
+// ==========================================================================
+// RTPS IR3/FLAG.22 anomaly with sf=0
+// ==========================================================================
+// psx-spx: "When using RTP with sf=0, the IR3 saturation flag (FLAG.22)
+// gets set only if MAC3 SAR 12 exceeds -8000h..+7FFFh, although IR3 is
+// saturated when MAC3 exceeds -8000h..+7FFFh."
+//
+// Need MAC3 that is out of [-0x8000, 0x7FFF] range (so IR3 saturates)
+// but MAC3 >> 12 is in range (so FLAG.22 should NOT be set).
+
+CESTER_TEST(prec_rtps_sf0_ir3_flag_anomaly, gte_tests,
+    gte_set_identity_rotation();
+    // Goal: MAC3 just over 0x7FFF while MAC3>>12 stays in range.
+    // With sf=0 the accumulated value is written to MAC3 without the >>12:
+    //   MAC3 = TRZ<<12 + R31*VX + R32*VY + R33*VZ (no shift applied)
+    // With identity rotation this reduces to
+    //   MAC3 = TRZ<<12 + VZ*0x1000
+    // We want MAC3 > 0x7FFF (IR3 saturates) but MAC3>>12 in [-0x8000,0x7FFF]
+    // MAC3 = 0x8000 -> MAC3>>12 = 8 (in range)
+    //   -> FLAG.22 NOT set but IR3 saturated
+    // TR is zeroed so that only the R33*VZ term contributes
+    cop2_putc(5, 0);
+    cop2_putc(6, 0);
+    cop2_putc(7, 0);  // TRZ = 0
+    cop2_putc(24, 0);
+    cop2_putc(25, 0);
+    cop2_putc(26, 200);  // H=200
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    // VZ = 8 -> MAC3 = 0 + 0x1000*8 = 0x8000 (just over 0x7FFF)
+    cop2_put(0, 0x00000000);
+    cop2_put(1, 8);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(0, 0));  // sf=0
+    int32_t mac3;
+    uint32_t ir3, flag;
+    cop2_get(27, mac3);
+    cop2_get(11, ir3);
+    flag = gte_read_flag();
+    uint32_t f22 = (flag >> 22) & 1;
+    ramsyscall_printf("sf=0 anomaly: MAC3=%d IR3=0x%04x FLAG=0x%08x F22=%u\n",
+                      mac3, ir3 & 0xffff, flag, f22);
+    // MAC3 = 0x8000 = 32768, one above the IR3 upper limit 0x7FFF,
+    // so IR3 saturates to 0x7FFF.
+    // MAC3 >> 12 = 0x8000 >> 12 = 8 -> in range -> FLAG.22 should NOT be set.
+    // This is the anomaly: IR3 saturated but FLAG.22 not set.
+    cester_assert_int_eq(32768, mac3);
+    cester_assert_uint_eq(0x7fff, ir3);
+    cester_assert_uint_eq(0, f22);
+    uint32_t f17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(1, f17);  // division overflow; presumably H=200 >= SZ3*2 with SZ3=MAC3>>12=8
+)
+
+// Stronger test: MAC3 = 0x10000 -> well above 0x7FFF, but >>12 = 1 (in range)
+CESTER_TEST(prec_rtps_sf0_ir3_flag_strong, gte_tests,
+    gte_set_identity_rotation();
+    cop2_putc(5, 0);
+    cop2_putc(6, 0);
+    cop2_putc(7, 0);  // TRZ = 0
+    cop2_putc(24, 0);
+    cop2_putc(25, 0);
+    cop2_putc(26, 200);  // H=200
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    // VZ = 16 -> MAC3 = 0x1000 * 16 = 0x10000 (65536, way above 0x7FFF)
+    cop2_put(0, 0x00000000);
+    cop2_put(1, 16);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(0, 0));  // sf=0
+    int32_t mac3;
+    uint32_t ir3, flag;
+    cop2_get(27, mac3);
+    cop2_get(11, ir3);
+    flag = gte_read_flag();
+    uint32_t f22 = (flag >> 22) & 1;
+    ramsyscall_printf("sf=0 strong: MAC3=%d IR3=0x%04x FLAG=0x%08x F22=%u\n",
+                      mac3, ir3 & 0xffff, flag, f22);
+    // MAC3 = 0x10000 -> IR3 saturated to 0x7FFF
+    cester_assert_uint_eq(0x7fff, ir3);
+    // MAC3 >> 12 = 0x10000 >> 12 = 16 -> in range -> FLAG.22 NOT set
+    cester_assert_uint_eq(0, f22);
+)
+
+// Counter-test: MAC3 >> 12 exceeds range -> FLAG.22 SHOULD be set
+CESTER_TEST(prec_rtps_sf0_ir3_flag_set, gte_tests,
+    gte_set_identity_rotation();
+    cop2_putc(5, 0);
+    cop2_putc(6, 0);
+    cop2_putc(7, 0x7fff);  // TRZ = 0x7FFF, so MAC3 = 0x7FFF<<12 + VZ*0x1000
+    cop2_putc(24, 0);
+    cop2_putc(25, 0);
+    cop2_putc(26, 200);  // H=200
+    cop2_putc(27, 0);
+    cop2_putc(28, 0);
+    // sf=0 with identity rotation: MAC3 = TRZ<<12 + VZ*0x1000
+    // TRZ alone gives 0x7FFF<<12 = 0x7FFF000, i.e. MAC3>>12 = 0x7FFF,
+    // which sits exactly on the upper boundary.
+    // VZ=1 adds 0x1000: MAC3 = 0x7FFF000 + 0x1000 = 0x8000000
+    // MAC3>>12 = 0x8000 -> OUT of range -> FLAG.22 should be set
+    // MAC3 = 0x8000000 also exceeds 0x7FFF, so IR3 saturates as well
+    cop2_put(0, 0x00000000);
+    cop2_put(1, 1);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(0, 0));  // sf=0
+    int32_t mac3;
+    uint32_t ir3, flag;
+    cop2_get(27, mac3);
+    cop2_get(11, ir3);
+    flag = gte_read_flag();
+    uint32_t f22 = (flag >> 22) & 1;
+    ramsyscall_printf("sf=0 flag set: MAC3=%d IR3=0x%04x FLAG=0x%08x F22=%u\n",
+                      mac3, ir3 & 0xffff, flag, f22);
+    // MAC3>>12 = 0x8000 -> exceeds 0x7FFF -> FLAG.22 SHOULD be set
+    cester_assert_uint_eq(1, f22);
+)
diff --git a/src/mips/tests/gte/gte-regio.c b/src/mips/tests/gte/gte-regio.c
new file mode 100644
index 000000000..e5df15026
--- /dev/null
+++ b/src/mips/tests/gte/gte-regio.c
@@ -0,0 +1,374 @@
+// GTE register I/O tests: data/control register read/write, sign extension,
+// SXY FIFO, IRGB/ORGB, LZCS/LZCR, FLAG register, CTC2 sign extension.
+
+// ==========================================================================
+// Data register roundtrip and sign/zero extension
+// ==========================================================================
+
+CESTER_TEST(regio_mac0_roundtrip, gte_tests,
+    cop2_put(24, 0x12345678);  // data register 24 (MAC0)
+    uint32_t out;
+    cop2_get(24, out);
+    cester_assert_uint_eq(0x12345678, out);  // all 32 bits are expected back unchanged
+)
+
+CESTER_TEST(regio_mac1_roundtrip, gte_tests,
+    cop2_put(25, 0xdeadbeef);  // data register 25 (MAC1), pattern with the top bit set
+    uint32_t out;
+    cop2_get(25, out);
+    cester_assert_uint_eq(0xdeadbeef, out);  // all 32 bits are expected back unchanged
+)
+
+CESTER_TEST(regio_ir0_sign_extend, gte_tests,
+    cop2_put(8, 0x0000ffff);  // data register 8 (IR0): only the low 16 bits set
+    uint32_t out;
+    cop2_get(8, out);
+    cester_assert_uint_eq(0xffffffff, out);  // read-back is sign-extended from bit 15
+)
+
+CESTER_TEST(regio_ir1_sign_extend, gte_tests,
+    cop2_put(9, 0x00008000);  // data register 9 (IR1): only bit 15 set
+    uint32_t out;
+    cop2_get(9, out);
+    cester_assert_uint_eq(0xffff8000, out);  // read-back is sign-extended from bit 15
+)
+
+CESTER_TEST(regio_ir2_positive, gte_tests,
+    cop2_put(10, 0x00001234);  // data register 10 (IR2): bit 15 clear
+    uint32_t out;
+    cop2_get(10, out);
+    cester_assert_uint_eq(0x00001234, out);  // positive 16-bit value reads back unchanged
+)
+
+CESTER_TEST(regio_ir3_positive, gte_tests,
+    cop2_put(11, 0x00007fff);  // data register 11 (IR3): largest value with bit 15 clear
+    uint32_t out;
+    cop2_get(11, out);
+    cester_assert_uint_eq(0x00007fff, out);  // reads back unchanged, no sign extension
+)
+
+CESTER_TEST(regio_vz0_sign_extend, gte_tests,
+    cop2_put(1, 0x0000ff00);  // data register 1 (VZ0): 16-bit value with bit 15 set
+    uint32_t out;
+    cop2_get(1, out);
+    cester_assert_uint_eq(0xffffff00, out);  // read-back is sign-extended from bit 15
+)
+
+CESTER_TEST(regio_vxy0_packed, gte_tests,
+    cop2_put(0, 0x00640032);  // data register 0 (VXY0): 0x0064 in the high half, 0x0032 in the low half
+    uint32_t out;
+    cop2_get(0, out);
+    cester_assert_uint_eq(0x00640032, out);  // both packed halves read back unchanged
+)
+
+CESTER_TEST(regio_otz_zero_extend, gte_tests,
+    cop2_put(7, 0xffffffff);  // data register 7 (OTZ): write all ones
+    uint32_t out;
+    cop2_get(7, out);
+    cester_assert_uint_eq(0x0000ffff, out);  // only the low 16 bits come back; upper half reads as zero
+)
+
+CESTER_TEST(regio_sz_zero_extend, gte_tests,
+    cop2_put(16, 0xdeadbeef);  // data register 16 (first SZ FIFO entry)
+    uint32_t out;
+    cop2_get(16, out);
+    cester_assert_uint_eq(0x0000beef, out);  // only the low 16 bits come back, zero-extended (bit 15 is set yet not propagated)
+)
+
+CESTER_TEST(regio_rgbc_roundtrip, gte_tests,
+    cop2_put(6, 0xaa554080);  // data register 6 (RGBC)
+    uint32_t out;
+    cop2_get(6, out);
+    cester_assert_uint_eq(0xaa554080, out);  // all 32 bits are expected back unchanged
+)
+
+CESTER_TEST(regio_res1_readwrite, gte_tests,
+    cop2_put(23, 0xdeadbeef);  // data register 23 (RES1)
+    uint32_t out;
+    cop2_get(23, out);
+    cester_assert_uint_eq(0xdeadbeef, out);  // the register stores and returns the full 32-bit value
+)
+
+// ==========================================================================
+// SXY FIFO
+// ==========================================================================
+
+CESTER_TEST(regio_sxy_fifo_push, gte_tests,
+    cop2_put(12, 0x00010002);  // SXY0
+    cop2_put(13, 0x00030004);  // SXY1
+    cop2_put(14, 0x00050006);  // SXY2
+    cop2_put(15, 0x00070008);  // writing register 15 (SXYP) pushes a new entry into the FIFO
+    uint32_t sxy0, sxy1, sxy2;
+    cop2_get(12, sxy0);
+    cop2_get(13, sxy1);
+    cop2_get(14, sxy2);
+    cester_assert_uint_eq(0x00030004, sxy0);  // former SXY1 moved down to SXY0
+    cester_assert_uint_eq(0x00050006, sxy1);  // former SXY2 moved down to SXY1
+    cester_assert_uint_eq(0x00070008, sxy2);  // pushed value lands in SXY2
+)
+
+CESTER_TEST(regio_sxyp_read_returns_sxy2, gte_tests,
+    cop2_put(14, 0xaabbccdd);  // direct write to SXY2 (register 14)
+    uint32_t sxyp;
+    cop2_get(15, sxyp);  // read register 15 (SXYP)
+    cester_assert_uint_eq(0xaabbccdd, sxyp);  // the SXYP read returns the SXY2 contents
+)
+
+CESTER_TEST(regio_sxy_fifo_triple_push, gte_tests,
+    cop2_put(15, 0x11111111);  // first push through SXYP
+    cop2_put(15, 0x22222222);  // second push
+    cop2_put(15, 0x33333333);  // third push
+    uint32_t sxy0, sxy1, sxy2;
+    cop2_get(12, sxy0);
+    cop2_get(13, sxy1);
+    cop2_get(14, sxy2);
+    cester_assert_uint_eq(0x11111111, sxy0);  // oldest push ends up in SXY0
+    cester_assert_uint_eq(0x22222222, sxy1);
+    cester_assert_uint_eq(0x33333333, sxy2);  // newest push is in SXY2
+)
+
+// ==========================================================================
+// IRGB / ORGB
+// ==========================================================================
+
+CESTER_TEST(regio_irgb_expand, gte_tests,
+    cop2_put(28, 0x7fff);  // register 28 (IRGB): all three 5-bit fields at 0x1f
+    __asm__ volatile("nop; nop; nop; nop");  // presumably lets the IRGB write settle before IR1-3 are read (confirm)
+    uint32_t ir1, ir2, ir3;
+    cop2_get(9, ir1);
+    cop2_get(10, ir2);
+    cop2_get(11, ir3);
+    cester_assert_uint_eq(0x00000f80, ir1);  // 0x1f << 7
+    cester_assert_uint_eq(0x00000f80, ir2);  // 0x1f << 7
+    cester_assert_uint_eq(0x00000f80, ir3);  // 0x1f << 7
+)
+
+CESTER_TEST(regio_irgb_individual, gte_tests,
+    cop2_put(28, 0x000a);  // IRGB with R=10, G=0, B=0 (only the lowest 5-bit field is non-zero)
+    __asm__ volatile("nop; nop; nop; nop");  // presumably lets the IRGB write settle before IR1-3 are read (confirm)
+    uint32_t ir1, ir2, ir3;
+    cop2_get(9, ir1);
+    cop2_get(10, ir2);
+    cop2_get(11, ir3);
+    cester_assert_uint_eq(0x00000500, ir1);  // 10 << 7: the R field expands into IR1
+    cester_assert_uint_eq(0x00000000, ir2);  // G field is zero
+    cester_assert_uint_eq(0x00000000, ir3);  // B field is zero
+)
+
+CESTER_TEST(regio_orgb_pack, gte_tests,
+    cop2_put(9, 0x0f80);  // IR1 = 0x1f << 7
+    cop2_put(10, 0x0f80);  // IR2 = 0x1f << 7
+    cop2_put(11, 0x0f80);  // IR3 = 0x1f << 7
+    uint32_t orgb;
+    cop2_get(29, orgb);  // register 29 (ORGB) is derived from IR1-3
+    cester_assert_uint_eq(0x7fff, orgb);  // three 5-bit fields, each 0x1f
+)
+
+// ORGB saturates, not truncates (psx-spx correct, Sony SDK wrong)
+CESTER_TEST(regio_orgb_saturate_negative, gte_tests,
+    cop2_put(9, 0xffff8000);  // IR1 = -32768 (negative)
+    cop2_put(10, 0x00002000); // IR2 = 8192 (large positive)
+    cop2_put(11, 0x00000380); // IR3 = 896 (normal)
+    uint32_t orgb;
+    cop2_get(29, orgb);
+    uint32_t r = orgb & 0x1f;  // bits 0-4, taken from IR1
+    uint32_t g = (orgb >> 5) & 0x1f;  // bits 5-9, taken from IR2
+    uint32_t b = (orgb >> 10) & 0x1f;  // bits 10-14, taken from IR3
+    cester_assert_uint_eq(0, r);    // negative saturated to 0
+    cester_assert_uint_eq(31, g);   // 8192 >> 7 = 64, saturated to 0x1f
+    cester_assert_uint_eq(7, b);    // 896 >> 7 = 7
+)
+
+CESTER_TEST(regio_orgb_saturate_large, gte_tests,
+    cop2_put(9, 0x1000);  // IR1: smallest value whose >>7 no longer fits in 5 bits
+    cop2_put(10, 0x1000);  // IR2, same value
+    cop2_put(11, 0x1000);  // IR3, same value
+    uint32_t orgb;
+    cop2_get(29, orgb);
+    // 0x1000>>7 = 0x20 = 32, saturated to 31 in every field (truncation would give 0)
+    cester_assert_uint_eq(0x7fff, orgb);
+)
+
+// ==========================================================================
+// LZCS / LZCR
+// ==========================================================================
+
+CESTER_TEST(regio_lzcr_zero, gte_tests,
+    cop2_put(30, 0x00000000);  // register 30 (LZCS): source value
+    uint32_t lzcr;
+    cop2_get(31, lzcr);  // register 31 (LZCR): resulting count
+    cester_assert_uint_eq(32, lzcr);  // all 32 bits are leading zeros
+)
+
+CESTER_TEST(regio_lzcr_all_ones, gte_tests,
+    cop2_put(30, 0xffffffff);  // LZCS with the sign bit set
+    uint32_t lzcr;
+    cop2_get(31, lzcr);
+    cester_assert_uint_eq(32, lzcr);  // leading ones are counted when the top bit is set: all 32
+)
+
+CESTER_TEST(regio_lzcr_one, gte_tests,
+    cop2_put(30, 0x00000001);  // LZCS: only bit 0 set
+    uint32_t lzcr;
+    cop2_get(31, lzcr);
+    cester_assert_uint_eq(31, lzcr);  // 31 leading zeros above bit 0
+)
+
+CESTER_TEST(regio_lzcr_msb_set, gte_tests,
+    cop2_put(30, 0x80000000);  // LZCS: only bit 31 set
+    uint32_t lzcr;
+    cop2_get(31, lzcr);
+    cester_assert_uint_eq(1, lzcr);  // a single leading one
+)
+
+CESTER_TEST(regio_lzcr_positive_mid, gte_tests,
+    cop2_put(30, 0x00010000);  // LZCS: only bit 16 set
+    uint32_t lzcr;
+    cop2_get(31, lzcr);
+    cester_assert_uint_eq(15, lzcr);  // bits 31..17 are the 15 leading zeros
+)
+
+CESTER_TEST(regio_lzcr_negative_mid, gte_tests,
+    cop2_put(30, 0xfffe0000);  // LZCS: bits 31..17 set, bit 16 clear
+    uint32_t lzcr;
+    cop2_get(31, lzcr);
+    cester_assert_uint_eq(15, lzcr);  // bits 31..17 are the 15 leading ones
+)
+
+// ==========================================================================
+// FLAG register
+// ==========================================================================
+
+CESTER_TEST(regio_flag_write_mask, gte_tests,
+    cop2_putc(31, 0xffffffff);  // control register 31 (FLAG): try to set every bit
+    uint32_t flag = gte_read_flag();
+    cester_assert_uint_eq(0xfffff000, flag);  // bits 12-31 read back set, bits 0-11 read back as zero
+)
+
+CESTER_TEST(regio_flag_low_bits_masked, gte_tests,
+    cop2_putc(31, 0x00000fff);  // FLAG: set only bits 0-11
+    uint32_t flag = gte_read_flag();
+    cester_assert_uint_eq(0, flag);  // none of the low 12 bits are retained
+)
+
+CESTER_TEST(regio_flag_bit12_no_summary, gte_tests,
+    cop2_putc(31, (1 << 12));  // FLAG: set bit 12 alone
+    uint32_t flag = gte_read_flag();
+    cester_assert_uint_eq((1 << 12), flag);  // bit 12 is kept and summary bit 31 stays clear
+)
+
+CESTER_TEST(regio_flag_bits19_22_no_summary, gte_tests,
+    uint32_t flag;
+    int ok = 1;  // cleared as soon as any bit in the range misbehaves
+    int i;
+    for (i = 19; i <= 22; i++) {
+        cop2_putc(31, (1u << i));  // write FLAG with exactly one bit set
+        flag = gte_read_flag();
+        if (flag != (1u << i)) ok = 0;  // expect the same single bit back, without bit 31
+    }
+    cester_assert_int_eq(1, ok);  // one aggregate assertion for bits 19..22
+)
+
+CESTER_TEST(regio_flag_bits13_18_set_summary, gte_tests,
+    uint32_t flag;
+    int ok = 1;  // cleared as soon as any bit in the range misbehaves
+    int i;
+    for (i = 13; i <= 18; i++) {
+        cop2_putc(31, (1u << i));  // write FLAG with exactly one bit set
+        flag = gte_read_flag();
+        if (flag != ((1u << i) | (1u << 31))) ok = 0;  // expect the bit plus summary bit 31
+    }
+    cester_assert_int_eq(1, ok);  // one aggregate assertion for bits 13..18
+)
+
+CESTER_TEST(regio_flag_bits23_30_set_summary, gte_tests,
+    uint32_t flag;
+    int ok = 1;  // cleared as soon as any bit in the range misbehaves
+    int i;
+    for (i = 23; i <= 30; i++) {
+        cop2_putc(31, (1u << i));  // write FLAG with exactly one bit set
+        flag = gte_read_flag();
+        if (flag != ((1u << i) | (1u << 31))) ok = 0;  // expect the bit plus summary bit 31
+    }
+    cester_assert_int_eq(1, ok);  // one aggregate assertion for bits 23..30
+)
+
+// ==========================================================================
+// Control register sign extension
+// ==========================================================================
+
+CESTER_TEST(regio_ctrl_r33_sign_extend, gte_tests,
+    cop2_putc(4, 0x00008000);  // control register 4 (R33): only bit 15 set
+    uint32_t out;
+    cop2_getc(4, out);
+    cester_assert_uint_eq(0xffff8000, out);  // read-back is sign-extended from bit 15
+)
+
+CESTER_TEST(regio_ctrl_zsf3_sign_extend, gte_tests,
+    cop2_putc(29, 0x0000ffff);  // control register 29 (ZSF3): low 16 bits all set
+    uint32_t out;
+    cop2_getc(29, out);
+    cester_assert_uint_eq(0xffffffff, out);  // read-back is sign-extended from bit 15
+)
+
+// H register sign-extension bug (documented in psx-spx, omitted from Sony's documentation)
+CESTER_TEST(regio_h_sign_extension_bug, gte_tests,
+    cop2_putc(26, 0x8000);  // control register 26 (H): bit 15 set
+    uint32_t h;
+    cop2_getc(26, h);
+    cester_assert_uint_eq(0xffff8000, h);  // the read-back comes out sign-extended
+)
+
+CESTER_TEST(regio_h_positive, gte_tests,
+    cop2_putc(26, 0x7fff);  // control register 26 (H): largest value with bit 15 clear
+    uint32_t h;
+    cop2_getc(26, h);
+    cester_assert_uint_eq(0x00007fff, h);  // reads back unchanged
+)
+
+// Every control register that holds a single 16-bit value reads back sign-extended
+CESTER_TEST(regio_ctc2_sign_extend_all, gte_tests,
+    uint32_t out;
+    int ok = 1;  // cleared by the first register that does not read back 0xffff8000
+    // R33(4), L33(12), LB3(20), H(26), DQA(27), ZSF3(29), ZSF4(30)
+    cop2_putc(4, 0x8000);  cop2_getc(4, out);  if (out != 0xffff8000) ok = 0;
+    cop2_putc(12, 0x8000); cop2_getc(12, out); if (out != 0xffff8000) ok = 0;
+    cop2_putc(20, 0x8000); cop2_getc(20, out); if (out != 0xffff8000) ok = 0;
+    cop2_putc(26, 0x8000); cop2_getc(26, out); if (out != 0xffff8000) ok = 0;
+    cop2_putc(27, 0x8000); cop2_getc(27, out); if (out != 0xffff8000) ok = 0;
+    cop2_putc(29, 0x8000); cop2_getc(29, out); if (out != 0xffff8000) ok = 0;
+    cop2_putc(30, 0x8000); cop2_getc(30, out); if (out != 0xffff8000) ok = 0;
+    cester_assert_int_eq(1, ok);  // one aggregate assertion; a failure does not say which register broke
+)
+
+// lm bit clamp behavior: same GPF inputs run with lm=0 and lm=1, comparing MAC1 and IR1
+CESTER_TEST(regio_lm_clamp, gte_tests,
+    // GPF sf=1 lm=0: IR clamp -0x8000..0x7fff
+    cop2_put(8, 0x1000);  // IR0 = 0x1000
+    cop2_put(9, 0xffff8000);  // IR1 = -32768
+    cop2_put(10, 0x100);  // IR2
+    cop2_put(11, 0x7fff);  // IR3
+    cop2_put(6, 0x00808080);  // RGBC
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 0));
+    int32_t mac1_lm0;
+    uint32_t ir1_lm0;
+    cop2_get(25, mac1_lm0);
+    cop2_get(9, ir1_lm0);
+    // GPF sf=1 lm=1: inputs are reloaded because the first run overwrote IR1
+    cop2_put(8, 0x1000);
+    cop2_put(9, 0xffff8000);
+    cop2_put(10, 0x100);
+    cop2_put(11, 0x7fff);
+    cop2_put(6, 0x00808080);
+    gte_clear_flag();
+    cop2_cmd(COP2_GPF(1, 1));
+    int32_t mac1_lm1;
+    uint32_t ir1_lm1;
+    cop2_get(25, mac1_lm1);
+    cop2_get(9, ir1_lm1);
+    cester_assert_int_eq(-32768, mac1_lm0);  // MAC1 is identical in both modes
+    cester_assert_int_eq(-32768, mac1_lm1);
+    cester_assert_uint_eq(0xffff8000, ir1_lm0);  // lm=0: stays -32768
+    cester_assert_uint_eq(0x00000000, ir1_lm1);  // lm=1: clamped to 0
+)
diff --git a/src/mips/tests/gte/gte-rtps.c b/src/mips/tests/gte/gte-rtps.c
new file mode 100644
index 000000000..338bfa3a5
--- /dev/null
+++ b/src/mips/tests/gte/gte-rtps.c
@@ -0,0 +1,224 @@
+// RTPS/RTPT: perspective transformation (single and triple)
+// Also covers division table behavior and screen coordinate saturation.
+
+CESTER_TEST(rtps_identity_center, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 1000);  // only TRZ is non-zero
+    gte_set_screen(160 << 16, 120 << 16, 200);  // offsets are passed shifted left by 16; H = 200
+    cop2_put(0, 0x00000000);  // V0 = (0, 0)
+    cop2_put(1, 0);           // VZ0 = 0
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));  // sf=1, lm=0
+    uint32_t sz3, sxy2;
+    cop2_get(19, sz3);
+    cop2_get(14, sxy2);
+    cester_assert_uint_eq(1000, sz3);  // depth comes from TRZ alone
+    cester_assert_int_eq(160, (int16_t)(sxy2 & 0xffff));  // SX (low half) equals the X offset
+    cester_assert_int_eq(120, (int16_t)(sxy2 >> 16));  // SY (high half) equals the Y offset
+)
+
+CESTER_TEST(rtps_offset_vertex, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(160 << 16, 120 << 16, 200);
+    cop2_put(0, (50 << 16) | (100 & 0xffff));  // V0 = (100, 50)
+    cop2_put(1, 500);  // VZ0 = 500
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));  // sf=1, lm=0
+    uint32_t sz3;
+    cop2_get(19, sz3);
+    cester_assert_uint_eq(500, sz3);
+    // SX: exact math gives 160 + 100*200/500 = 200; the asserted value is 199 (division rounding)
+    // SY: exact math gives 120 + 50*200/500 = 140; the asserted value is 139
+    uint32_t sxy2;
+    cop2_get(14, sxy2);
+    int16_t sx = (int16_t)(sxy2 & 0xffff);
+    int16_t sy = (int16_t)(sxy2 >> 16);
+    ramsyscall_printf("RTPS offset: SX=%d SY=%d\n", sx, sy);
+    cester_assert_int_eq(199, sx);
+    cester_assert_int_eq(139, sy);
+    cester_assert_uint_eq(500, sz3);  // NOTE(review): repeats the SZ3 check already made right after the read
+)
+
+// RTPS MAC output: MAC1-3 after an identity rotation plus a translation
+CESTER_TEST(rtps_mac_output, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(100, 200, 300);
+    gte_set_screen(0, 0, 200);
+    cop2_put(0, (50 << 16) | 10);  // V0 = (10, 50)
+    cop2_put(1, 500);  // VZ0 = 500
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));  // sf=1, lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // Identity rotation: MAC = V + TR
+    cester_assert_int_eq(110, mac1);  // 10 + 100
+    cester_assert_int_eq(250, mac2);  // 50 + 200
+    cester_assert_int_eq(800, mac3);  // 500 + 300
+)
+
+// RTPS with a very small Z (SZ3 = 1, not 0) -> division overflow
+CESTER_TEST(rtps_division_overflow, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 200);
+    cop2_put(0, (0 << 16) | 100);  // V0 = (100, 0)
+    cop2_put(1, 1);  // VZ0 = 1, very small Z
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));  // sf=1, lm=0
+    uint32_t flag;
+    flag = gte_read_flag();
+    // H=200, SZ3=1 -> H >= SZ3*2 (200 >= 2) -> division overflow FLAG.17
+    ramsyscall_printf("RTPS div overflow: FLAG=0x%08x (bit17=%u)\n", flag, (flag >> 17) & 1);
+    uint32_t flag17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(1, flag17);  // only bit 17 is checked; other FLAG bits are ignored
+)
+
+// RTPS screen coordinate saturation: SX is clamped and FLAG.14 is raised
+CESTER_TEST(rtps_screen_saturation, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 200);
+    // Large X, small Z -> SX will exceed -0x400..0x3FF range
+    cop2_put(0, (0 << 16) | 0x7fff);  // VX0 = 32767
+    cop2_put(1, 100);                   // VZ0 = 100
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));  // sf=1, lm=0
+    uint32_t sxy2, flag;
+    cop2_get(14, sxy2);
+    flag = gte_read_flag();
+    int16_t sx = (int16_t)(sxy2 & 0xffff);  // SX is the low half of SXY2
+    ramsyscall_printf("RTPS sat: SX=%d FLAG=0x%08x (bit14=%u)\n", sx, flag, (flag >> 14) & 1);
+    // SX should be saturated to 0x3FF
+    cester_assert_int_eq(0x3ff, sx);
+    uint32_t flag14 = (flag >> 14) & 1;
+    cester_assert_uint_eq(1, flag14);  // FLAG.14 = SX2 saturated
+)
+
+// RTPS depth cue output (MAC0/IR0) with a negative DQA
+CESTER_TEST(rtps_depth_cue, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    cop2_putc(24, 0);  // OFX = 0
+    cop2_putc(25, 0);  // OFY = 0
+    cop2_putc(26, 200);  // H = 200
+    cop2_putc(27, 0xfffff880);  // DQA = -1920 (negative)
+    cop2_putc(28, 0x01000000);  // DQB = 16777216
+    cop2_put(0, 0x00000000);  // V0 = (0, 0)
+    cop2_put(1, 1000);  // VZ0 = 1000
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(1, 0));  // sf=1, lm=0
+    int32_t mac0;
+    uint32_t ir0;
+    cop2_get(24, mac0);
+    cop2_get(8, ir0);
+    ramsyscall_printf("RTPS depth: MAC0=%d IR0=0x%04x\n", mac0, ir0 & 0xffff);
+    // IR0 should be clamped to [0, 0x1000]; MAC0 keeps the unclamped negative result
+    cester_assert_int_eq(-8388224, mac0);
+    cester_assert_uint_eq(0, ir0);  // negative MAC0 -> IR0 clamps to the lower bound
+)
+
+// RTPS with sf=0: MAC3 keeps the unshifted value, IR3 saturates, SZ3 still gets the shifted depth
+CESTER_TEST(rtps_sf0, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0x1000);  // TRZ = 0x1000
+    gte_set_screen(0, 0, 200);
+    cop2_put(0, 0x00000000);  // V0 = (0, 0)
+    cop2_put(1, 0);  // VZ0 = 0
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPS(0, 0));  // sf=0, lm=0
+    int32_t mac3;
+    uint32_t ir3, sz3, flag;
+    cop2_get(27, mac3);
+    cop2_get(11, ir3);
+    cop2_get(19, sz3);
+    flag = gte_read_flag();
+    ramsyscall_printf("RTPS sf=0: MAC3=%d IR3=0x%04x SZ3=%u FLAG=0x%08x\n",
+                      mac3, ir3 & 0xffff, sz3, flag);
+    // sf=0: MAC3 = TRZ<<12 + rotation = 0x1000<<12 = 0x1000000 (no >>12 shift)
+    // IR3 uses Lm_B3_sf which checks MAC3>>12 for FLAG but clamps the unshifted value
+    cester_assert_int_eq(16777216, mac3);  // 0x1000000
+    cester_assert_uint_eq(0x7fff, ir3);  // unshifted MAC3 is above 0x7fff, so IR3 saturates
+    cester_assert_uint_eq(4096, sz3);  // 0x1000000 >> 12
+    cester_assert_uint_eq(0, flag);  // MAC3 >> 12 = 0x1000 is in range, so no flag despite the IR3 clamp
+)
+
+// RTPT: triple perspective transform (V0 -> SXY0, V1 -> SXY1, V2 -> SXY2)
+CESTER_TEST(rtpt_three_vertices, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(160 << 16, 120 << 16, 200);
+    // V0 = (0, 0, 1000)
+    cop2_put(0, 0x00000000);
+    cop2_put(1, 1000);
+    // V1 = (100, 0, 1000)
+    cop2_put(2, (0 << 16) | 100);
+    cop2_put(3, 1000);
+    // V2 = (0, 100, 1000)
+    cop2_put(4, (100 << 16) | 0);
+    cop2_put(5, 1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPT(1, 0));  // sf=1, lm=0
+    uint32_t sxy0, sxy1, sxy2;
+    cop2_get(12, sxy0);
+    cop2_get(13, sxy1);
+    cop2_get(14, sxy2);
+    // V0 at origin -> (160, 120)
+    cester_assert_int_eq(160, (int16_t)(sxy0 & 0xffff));
+    cester_assert_int_eq(120, (int16_t)(sxy0 >> 16));
+    // V1 at (100,0,1000) -> SX ~ 180; NOTE(review): SX1 and both V2 coordinates are only logged, not asserted
+    int16_t sx1 = (int16_t)(sxy1 & 0xffff);
+    int16_t sy1 = (int16_t)(sxy1 >> 16);
+    ramsyscall_printf("RTPT: V1=(%d,%d) V2=(%d,%d)\n", sx1, sy1,
+                      (int16_t)(sxy2 & 0xffff), (int16_t)(sxy2 >> 16));
+    cester_assert_int_eq(120, sy1);  // V1 has Y = 0, so SY stays at the 120 offset
+)
+
+// RTPT: FLAG accumulates across all three vertices
+CESTER_TEST(rtpt_flag_accumulates, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(0, 0, 200);
+    // V0: normal
+    cop2_put(0, 0x00000000);
+    cop2_put(1, 1000);
+    // V1: will cause SX saturation (large X, small Z)
+    cop2_put(2, (0 << 16) | 0x7fff);
+    cop2_put(3, 100);
+    // V2: normal
+    cop2_put(4, 0x00000000);
+    cop2_put(5, 1000);
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPT(1, 0));  // sf=1, lm=0
+    uint32_t flag;
+    flag = gte_read_flag();
+    // FLAG must still carry the error bits raised by V1, even though V2 was fine
+    ramsyscall_printf("RTPT flag accum: FLAG=0x%08x\n", flag);
+    // Asserted bit: FLAG.17, division overflow from V1 (H=200, SZ3=100, 200 >= 200)
+    uint32_t flag17 = (flag >> 17) & 1;
+    cester_assert_uint_eq(1, flag17);  // NOTE(review): the SX2 saturation bit (FLAG.14) from V1 is not asserted here
+)
+
+// RTPT pushes SZ FIFO correctly: one depth per vertex, in vertex order
+CESTER_TEST(rtpt_sz_fifo, gte_tests,
+    gte_set_identity_rotation();
+    gte_set_translation(0, 0, 0);
+    gte_set_screen(160 << 16, 120 << 16, 200);
+    cop2_put(0, 0x00000000);  // V0 = (0, 0)
+    cop2_put(1, 100);  // VZ0 = 100
+    cop2_put(2, 0x00000000);  // V1 = (0, 0)
+    cop2_put(3, 200);  // VZ1 = 200
+    cop2_put(4, 0x00000000);  // V2 = (0, 0)
+    cop2_put(5, 300);  // VZ2 = 300
+    gte_clear_flag();
+    cop2_cmd(COP2_RTPT(1, 0));  // sf=1, lm=0
+    uint32_t sz1, sz2, sz3;
+    cop2_get(17, sz1);
+    cop2_get(18, sz2);
+    cop2_get(19, sz3);
+    cester_assert_uint_eq(100, sz1);  // depth of V0
+    cester_assert_uint_eq(200, sz2);  // depth of V1
+    cester_assert_uint_eq(300, sz3);  // depth of V2
+)
diff --git a/src/mips/tests/gte/gte-sqr.c b/src/mips/tests/gte/gte-sqr.c
new file mode 100644
index 000000000..615fa9814
--- /dev/null
+++ b/src/mips/tests/gte/gte-sqr.c
@@ -0,0 +1,88 @@
+// SQR: square of IR vector
+
+CESTER_TEST(sqr_shifted, gte_tests,
+    cop2_put(9, 0x1000);   // IR1 = 1.0 (0x1000 is 1.0 in this fixed-point format)
+    cop2_put(10, 0x0800);  // IR2 = 0.5
+    cop2_put(11, 0x2000);  // IR3 = 2.0
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(1, 0));  // sf=1 (result shifted right by 12), lm=0
+    uint32_t ir1, ir2, ir3;
+    cop2_get(9, ir1);
+    cop2_get(10, ir2);
+    cop2_get(11, ir3);
+    cester_assert_uint_eq(0x1000, ir1);  // 1.0^2 = 1.0
+    cester_assert_uint_eq(0x0400, ir2);  // 0.5^2 = 0.25
+    cester_assert_uint_eq(0x4000, ir3);  // 2.0^2 = 4.0 (no saturation, lm=0)
+)
+
+CESTER_TEST(sqr_unshifted, gte_tests,
+    cop2_put(9, 4);  // IR1
+    cop2_put(10, 5);  // IR2
+    cop2_put(11, 6);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(0, 0));  // sf=0 (no shift), lm=0
+    uint32_t ir1, ir2, ir3;
+    cop2_get(9, ir1);
+    cop2_get(10, ir2);
+    cop2_get(11, ir3);
+    cester_assert_uint_eq(16, ir1);  // 4 * 4
+    cester_assert_uint_eq(25, ir2);  // 5 * 5
+    cester_assert_uint_eq(36, ir3);  // 6 * 6
+)
+
+// SQR sets MAC1-3 as well (checked here with squares that exceed the 16-bit IR range)
+CESTER_TEST(sqr_mac_output, gte_tests,
+    cop2_put(9, 100);  // IR1
+    cop2_put(10, 200);  // IR2
+    cop2_put(11, 300);  // IR3
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(0, 0));  // sf=0 (no shift), lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    cester_assert_int_eq(10000, mac1);  // 100 * 100
+    cester_assert_int_eq(40000, mac2);  // 200 * 200
+    cester_assert_int_eq(90000, mac3);  // 300 * 300
+)
+
+// SQR with IR saturation (shifted, every result > 0x7fff with lm=0)
+CESTER_TEST(sqr_saturation_shifted, gte_tests,
+    cop2_put(9, 0x4000);  // 4.0 in 4.12; 4^2 = 16.0 = 0x10000 after >>12, above 0x7fff -> saturates
+    cop2_put(10, 0x5a82); // ~5.657 (sqrt(32)); square is ~32.0 = ~0x20000 after >>12 -> saturates
+    cop2_put(11, 0x7fff); // max positive; 0x7fff^2 >>12 = huge, saturates
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(1, 0));  // sf=1, lm=0
+    uint32_t ir1, ir2, ir3;
+    uint32_t flag;
+    cop2_get(9, ir1);
+    cop2_get(10, ir2);
+    cop2_get(11, ir3);
+    flag = gte_read_flag();
+    ramsyscall_printf("SQR sat: IR1=0x%04x IR2=0x%04x IR3=0x%04x FLAG=0x%08x\n",
+                      ir1 & 0xffff, ir2 & 0xffff, ir3 & 0xffff, flag);
+    cester_assert_uint_eq(0x7fff, ir1 & 0xffff);  // all three IRs clamp to the positive limit
+    cester_assert_uint_eq(0x7fff, ir2 & 0xffff);
+    cester_assert_uint_eq(0x7fff, ir3 & 0xffff);
+    cester_assert_uint_eq(0x81c00000, flag);  // bits 24, 23, 22 (IR1/IR2/IR3 saturated) plus summary bit 31
+)
+
+// SQR with negative input (result should still be positive: square)
+CESTER_TEST(sqr_negative_input, gte_tests,
+    cop2_put(9, 0xfffffff6);  // -10 (sign-extended)
+    cop2_put(10, 0xffffffce); // -50
+    cop2_put(11, 0xffffff9c); // -100
+    gte_clear_flag();
+    cop2_cmd(COP2_SQR(0, 0));  // sf=0 (no shift), lm=0
+    int32_t mac1, mac2, mac3;
+    cop2_get(25, mac1);
+    cop2_get(26, mac2);
+    cop2_get(27, mac3);
+    // Each IR is multiplied by itself as a signed 16-bit value,
+    // so the sign cancels out and the squares come back positive:
+    // -10 * -10 = 100, -50 * -50 = 2500, -100 * -100 = 10000
+    ramsyscall_printf("SQR neg: MAC1=%d MAC2=%d MAC3=%d\n", mac1, mac2, mac3);
+    cester_assert_int_eq(100, mac1);
+    cester_assert_int_eq(2500, mac2);
+    cester_assert_int_eq(10000, mac3);
+)
diff --git a/src/mips/tests/gte/gte.c b/src/mips/tests/gte/gte.c
new file mode 100644
index 000000000..43fd73913
--- /dev/null
+++ b/src/mips/tests/gte/gte.c
@@ -0,0 +1,138 @@
+/*
+
+MIT License
+
+Copyright (c) 2026 PCSX-Redux authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+// GTE (Geometry Transformation Engine) hardware validation test suite.
+// All test expectations verified against SCPH-5501 silicon.
+//
+// Sub-test files are included into this single compilation unit
+// because libcester requires a single TU via __BASE_FILE__ re-include.
+
+#include "common/hardware/cop2.h"
+#include "common/syscalls/syscalls.h"
+
+// clang-format off
+
+// ==========================================================================
+// Helper functions (guarded against cester double-include)
+// ==========================================================================
+
+#ifndef GTE_HELPERS_DEFINED
+#define GTE_HELPERS_DEFINED
+
+static inline void gte_enable(void) {  // Sets bit 30 of COP0 register 12 (Status; bit 30 is the COP2 enable) so GTE instructions can run.
+    uint32_t sr;
+    __asm__ volatile("mfc0 %0, $12" : "=r"(sr));  // read the current COP0 r12 value
+    sr |= 0x40000000;  // bit 30; every other bit is preserved
+    __asm__ volatile("mtc0 %0, $12; nop; nop" : : "r"(sr));  // write it back; the two nops presumably pad the mtc0 hazard (confirm)
+}
+
+static inline void gte_clear_flag(void) {  // Zeroes control register 31 (FLAG) so a test only sees flags from its own command.
+    cop2_putc(31, 0);
+}
+
+static inline uint32_t gte_read_flag(void) {  // Returns the current value of control register 31 (FLAG).
+    uint32_t flag;
+    cop2_getc(31, flag);  // cop2_getc stores the register value into its second argument
+    return flag;
+}
+
+static inline void gte_set_identity_rotation(void) {  // Loads control registers 0-4 (rotation matrix) with identity; 0x1000 is 1.0.
+    cop2_putc(0, 0x00001000);  // R11 = 0x1000 (low half), R12 = 0 (high half)
+    cop2_putc(1, 0x00000000);  // R13 = 0, R21 = 0
+    cop2_putc(2, 0x00001000);  // R22 = 0x1000 (low half), R23 = 0 (high half)
+    cop2_putc(3, 0x00000000);  // R31 = 0, R32 = 0
+    cop2_putc(4, 0x1000);  // R33 = 0x1000
+}
+
+static inline void gte_set_simple_light(void) {  // Loads control registers 8-12 with all zeros except register 12 (L33) = 0x1000.
+    cop2_putc(8, 0x00000000);  // first two packed matrix entries = 0
+    cop2_putc(9, 0x00000000);
+    cop2_putc(10, 0x00000000);
+    cop2_putc(11, 0x00000000);
+    cop2_putc(12, 0x1000);  // L33 = 0x1000 (1.0), the only non-zero entry
+}
+
+static inline void gte_set_white_light_color(void) {  // Loads control registers 16-20 with 0x1000 on the diagonal, zero elsewhere.
+    cop2_putc(16, 0x00001000);  // low half = 0x1000 (first diagonal entry), high half = 0
+    cop2_putc(17, 0x00000000);
+    cop2_putc(18, 0x00001000);  // low half = 0x1000 (second diagonal entry), high half = 0
+    cop2_putc(19, 0x00000000);
+    cop2_putc(20, 0x1000);  // LB3 = 0x1000 (third diagonal entry)
+}
+
+static inline void gte_set_zero_bk(void) {  // Zeroes control registers 13-15 (the "BK" triple, per the helper name).
+    cop2_putc(13, 0);
+    cop2_putc(14, 0);
+    cop2_putc(15, 0);
+}
+
+static inline void gte_set_far_color(int32_t r, int32_t g, int32_t b) {  // Writes r, g, b to control registers 21, 22, 23 (far color).
+    cop2_putc(21, r);
+    cop2_putc(22, g);
+    cop2_putc(23, b);
+}
+
+static inline void gte_set_translation(int32_t x, int32_t y, int32_t z) {  // Writes the translation vector to control registers 5-7.
+    cop2_putc(5, x);  // TRX
+    cop2_putc(6, y);  // TRY
+    cop2_putc(7, z);  // TRZ
+}
+
+static inline void gte_set_screen(int32_t ofx, int32_t ofy, uint16_t h) {  // Sets screen offsets and H, and zeroes DQA/DQB.
+    cop2_putc(24, ofx);  // OFX; callers in this suite pass the pixel offset shifted left by 16
+    cop2_putc(25, ofy);  // OFY, same convention
+    cop2_putc(26, h);  // H
+    cop2_putc(27, 0);  // DQA = 0
+    cop2_putc(28, 0);  // DQB = 0
+}
+
+#endif // GTE_HELPERS_DEFINED
+
+#undef unix
+#define CESTER_NO_SIGNAL
+#define CESTER_NO_TIME
+#define EXIT_SUCCESS 0
+#define EXIT_FAILURE 1
+#include "exotic/cester.h"
+
+CESTER_BEFORE_ALL(gte_tests,
+    gte_enable();  // suite-wide setup: turn the GTE on before any test touches a COP2 register
+)
+
+// Include sub-test files
+#include "gte-regio.c"
+#include "gte-nclip.c"
+#include "gte-avsz.c"
+#include "gte-sqr.c"
+#include "gte-op.c"
+#include "gte-gpf-gpl.c"
+#include "gte-rtps.c"
+#include "gte-mvmva.c"
+#include "gte-depthcue.c"
+#include "gte-lighting.c"
+#include "gte-edgecase.c"
+#include "gte-precision.c"
+#include "gte-encoding.c"
diff --git a/tests/pcsxrunner/gte.cc b/tests/pcsxrunner/gte.cc
new file mode 100644
index 000000000..57d78286d
--- /dev/null
+++ b/tests/pcsxrunner/gte.cc
@@ -0,0 +1,35 @@
+/***************************************************************************
+ *   Copyright (C) 2026 PCSX-Redux authors                                 *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.           *
+ ***************************************************************************/
+
+#include "gtest/gtest.h"
+#include "main/main.h"
+
+TEST(GTE, Interpreter) {
+    MainInvoker runner("-no-ui", "-run", "-bios", "src/mips/openbios/openbios.bin", "-testmode",
+                       "-interpreter", "-luacov", "-loadexe", "src/mips/tests/gte/gte.ps-exe");
+    const int exitCode = runner.invoke();  // run the GTE test executable on the interpreter core
+    EXPECT_EQ(exitCode, 0);                // the test expects a zero result from invoke()
+}
+
+TEST(GTE, Dynarec) {
+    MainInvoker runner("-no-ui", "-run", "-bios", "src/mips/openbios/openbios.bin", "-testmode",
+                       "-dynarec", "-luacov", "-loadexe", "src/mips/tests/gte/gte.ps-exe");
+    const int exitCode = runner.invoke();  // run the GTE test executable on the dynarec core
+    EXPECT_EQ(exitCode, 0);                // the test expects a zero result from invoke()
+}
diff --git a/vsprojects/core/core.vcxproj b/vsprojects/core/core.vcxproj
index c8c3617a3..430513445 100644
--- a/vsprojects/core/core.vcxproj
+++ b/vsprojects/core/core.vcxproj
@@ -139,12 +139,13 @@
     <ClCompile Include="..\..\src\core\DynaRec_x64\regAllocation.cc" />
     <ClCompile Include="..\..\src\core\DynaRec_x64\symbols.cc" />
     <ClCompile Include="..\..\src\core\eventslua.cc" />
+    <ClCompile Include="..\..\src\core\gte-instructions.cc" />
+    <ClCompile Include="..\..\src\core\gte-transfer.cc" />
     <ClCompile Include="..\..\src\core\patchmanager.cc" />
     <ClCompile Include="..\..\src\core\pio-cart.cc" />
     <ClCompile Include="..\..\src\core\gdb-server.cc" />
     <ClCompile Include="..\..\src\core\gpu.cc" />
     <ClCompile Include="..\..\src\core\gpulogger.cc" />
-    <ClCompile Include="..\..\src\core\gte.cc" />
     <ClCompile Include="..\..\src\core\kernel.cc" />
     <ClCompile Include="..\..\src\core\kernellog.cc" />
     <ClCompile Include="..\..\src\core\luaiso.cc" />
@@ -192,6 +193,7 @@
     <ClInclude Include="..\..\src\core\DynaRec_x64\recompiler.h" />
     <ClInclude Include="..\..\src\core\DynaRec_x64\regAllocation.h" />
     <ClInclude Include="..\..\src\core\eventslua.h" />
+    <ClInclude Include="..\..\src\core\gte-internal.h" />
     <ClInclude Include="..\..\src\core\patchmanager.h" />
     <ClInclude Include="..\..\src\core\pio-cart.h" />
     <ClInclude Include="..\..\src\core\gdb-server.h" />
diff --git a/vsprojects/core/core.vcxproj.filters b/vsprojects/core/core.vcxproj.filters
index ea375fdb1..6fd384b85 100644
--- a/vsprojects/core/core.vcxproj.filters
+++ b/vsprojects/core/core.vcxproj.filters
@@ -22,9 +22,6 @@
     <ClCompile Include="..\..\src\core\gpu.cc">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\core\gte.cc">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\src\core\kernel.cc">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -145,7 +142,15 @@
     <ClCompile Include="..\..\src\core\patchmanager.cc">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\core\ramlogger.cc" />
+    <ClCompile Include="..\..\src\core\gte-instructions.cc">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\core\gte-transfer.cc">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\core\ramlogger.cc">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\core\web-server.h">
@@ -298,7 +303,12 @@
     <ClInclude Include="..\..\src\core\patchmanager.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\src\core\ramlogger.h" />
+    <ClInclude Include="..\..\src\core\gte-internal.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\core\ramlogger.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="packages.config" />
diff --git a/vsprojects/gui/gui.vcxproj.filters b/vsprojects/gui/gui.vcxproj.filters
index 563743a78..ec2ad6ed2 100644
--- a/vsprojects/gui/gui.vcxproj.filters
+++ b/vsprojects/gui/gui.vcxproj.filters
@@ -109,8 +109,12 @@
     <ClCompile Include="..\..\src\gui\widgets\patches.cc">
       <Filter>Source Files\widgets</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\gui\widgets\zoomable-image.cc" />
-    <ClCompile Include="..\..\src\gui\widgets\ram-viewer.cc" />
+    <ClCompile Include="..\..\src\gui\widgets\ram-viewer.cc">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\gui\widgets\zoomable-image.cc">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\gui\gui.h">
@@ -212,8 +216,12 @@
     <ClInclude Include="..\..\src\gui\widgets\patches.h">
       <Filter>Header Files\widgets</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\src\gui\widgets\zoomable-image.h" />
-    <ClInclude Include="..\..\src\gui\widgets\ram-viewer.h" />
+    <ClInclude Include="..\..\src\gui\widgets\ram-viewer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\gui\widgets\zoomable-image.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="packages.config" />
diff --git a/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj b/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj
index dd4fdef08..eee7ad5e8 100644
--- a/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj
+++ b/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj
@@ -255,6 +255,7 @@
     <ClCompile Include="..\..\..\tests\pcsxrunner\cpu.cc" />
     <ClCompile Include="..\..\..\tests\pcsxrunner\dma.cc" />
     <ClCompile Include="..\..\..\tests\pcsxrunner\dumpproto.cc" />
+    <ClCompile Include="..\..\..\tests\pcsxrunner\gte.cc" />
     <ClCompile Include="..\..\..\tests\pcsxrunner\libc.cc" />
     <ClCompile Include="..\..\..\tests\pcsxrunner\lua.cc" />
     <ClCompile Include="..\..\..\tests\pcsxrunner\memcpy.cc" />
diff --git a/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj.filters b/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj.filters
index 10d27629b..2cca3fd97 100644
--- a/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj.filters
+++ b/vsprojects/tests/pcsxrunner/pcsxrunner.vcxproj.filters
@@ -36,6 +36,9 @@
     <ClCompile Include="..\..\..\tests\pcsxrunner\dma.cc">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\tests\pcsxrunner\gte.cc">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\tests\pcsxrunner\lua.cc">
       <Filter>Source Files</Filter>
     </ClCompile>