feat: Pipeline-level ORAM normalization + branchless oram_access_impl

scc-tw · scc-tw · commit 02bde9f2fa29 · 2026-04-03T23:38:39.000+08:00
Move ORAM scan from PUSH/POP handlers into dispatch_unit pipeline so
every sub-instruction does exactly 1 scan at identical cost:

- Add Phase D.oram between operand resolve and handler dispatch
- Branchless MUX computes addr/value/direction from decoded opcode:
  PUSH: addr=vm_sp-8, value=encode(plain_a), write
  POP:  addr=vm_sp, read
  else: addr=0, dummy read
- PUSH/POP handlers no longer call Oram::read/write directly;
  POP reads from VmExecution::oram_read_result staging field
- Remove the per-DU dummy_scan from the pipeline (replaced by the
  per-sub-insn scan); the dummy_scan declarations remain as legacy API

Make oram_access_impl read/write branchless:
- Previous code branched on is_write inside the 64-line inner loop
- New code always accumulates read result AND computes written word,
  then selects via bitmask — no data-dependent branches on access type

Fix benchmark OramPop setup: the LOAD_CONST BB now falls through instead
of ending in a JMP (the JMP target was fixed up to the first measured BB,
which skipped the PUSH BBs).
Fix benchmark runner: use dispatch_unit() for setup instead of step()
(step() lacks Phase D.oram, so PUSH setup via step() never wrote ORAM).
diff --git a/runtime/bench/program_factory.cpp b/runtime/bench/program_factory.cpp
@@ -78,19 +78,37 @@ static TestBB make_nop_bb(uint32_t bb_id, uint8_t epoch_base, uint32_t N) {
     return bb;
 }
 
-/// Build a setup BB of exactly N insns with the given instructions + NOP padding + JMP.
-static TestBB make_setup_bb(uint32_t bb_id, uint8_t epoch_base,
-                            const std::vector<TestInstruction>& setup_insns,
-                            uint32_t target_bb_id, uint32_t N) {
-    auto bb = make_bb(bb_id, epoch_base);
-    for (const auto& insn : setup_insns)
-        bb.instructions.push_back(insn);
-    // Pad with NOPs, leaving room for JMP at end
-    while (bb.instructions.size() + 1 < N)
-        bb.instructions.push_back({VmOpcode::NOP, f_none(), 0, 0, 0});
-    // JMP to first measured BB (or next setup BB)
-    bb.instructions.push_back({VmOpcode::JMP, f_none(), 0, 0, target_bb_id});
-    return bb;
+/// Build setup BBs of exactly N insns each.
+///
+/// For N≥2: one BB with setup_insns + NOP padding + JMP (exactly N insns only if setup_insns.size() < N).
+/// For N=1: one BB per setup instruction (no JMP — fallthrough handles it).
+///
+/// Returns the BBs (one for N≥2; one per setup insn for N=1, possibly none) and advances next_bb_id.
+static std::vector<TestBB> make_setup_bbs(uint32_t& next_bb_id,
+                                          uint8_t epoch_base,
+                                          const std::vector<TestInstruction>& setup_insns,
+                                          uint32_t target_bb_id, uint32_t N) {
+    std::vector<TestBB> result;
+
+    if (N >= 2) {
+        // All setup insns + NOPs + JMP fit in one N-insn BB
+        auto bb = make_bb(next_bb_id++, epoch_base);
+        for (const auto& insn : setup_insns)
+            bb.instructions.push_back(insn);
+        while (bb.instructions.size() + 1 < N)
+            bb.instructions.push_back({VmOpcode::NOP, f_none(), 0, 0, 0});
+        bb.instructions.push_back({VmOpcode::JMP, f_none(), 0, 0, target_bb_id});
+        result.push_back(std::move(bb));
+    } else {
+        // N=1: each setup insn is its own 1-insn BB (fallthrough to next)
+        for (const auto& insn : setup_insns) {
+            auto bb = make_bb(next_bb_id++, epoch_base);
+            bb.instructions.push_back(insn);
+            result.push_back(std::move(bb));
+        }
+    }
+
+    return result;
 }
 
 // ─── Trivial native for NATIVE_CALL benchmarks ────────────────────────
@@ -169,56 +187,49 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
     // ── Build setup BBs (untimed) ───────────────────────────────────
     uint32_t first_measured_id = 0;  // set after setup BBs
 
+    // Helper: add setup BBs and update setup_du_count.
+    auto add_setup = [&](const std::vector<TestInstruction>& insns) {
+        auto sbs = make_setup_bbs(next_bb_id, 0xA0, insns, 0, N);
+        prog.setup_du_count += static_cast<uint32_t>(sbs.size());
+        for (auto& sb : sbs)
+            bbs.push_back(std::move(sb));
+    };
+
     switch (spec.setup) {
-        case Setup::Reg1: {
+        case Setup::Reg1:
             pool.push_back(42);
-            auto sb = make_setup_bb(next_bb_id++, 0xA0,
-                {{VmOpcode::LOAD_CONST, f_pool(), spec.reg_a, 0, 0}},
-                0 /* target set later */, N);
-            bbs.push_back(std::move(sb));
-            prog.setup_du_count = 1;
+            add_setup({{VmOpcode::LOAD_CONST, f_pool(), spec.reg_a, 0, 0}});
             break;
-        }
-        case Setup::Reg2: {
+
+        case Setup::Reg2:
             pool.push_back(42);
             pool.push_back(3);
-            auto sb = make_setup_bb(next_bb_id++, 0xA0,
-                {{VmOpcode::LOAD_CONST, f_pool(), spec.reg_a, 0, 0},
-                 {VmOpcode::LOAD_CONST, f_pool(), spec.reg_b, 0, 1}},
-                0, N);
-            bbs.push_back(std::move(sb));
-            prog.setup_du_count = 1;
+            add_setup({{VmOpcode::LOAD_CONST, f_pool(), spec.reg_a, 0, 0},
+                       {VmOpcode::LOAD_CONST, f_pool(), spec.reg_b, 0, 1}});
             break;
-        }
-        case Setup::Memory: {
+
+        case Setup::Memory:
             pool.push_back(42);
-            auto sb = make_setup_bb(next_bb_id++, 0xA0,
-                {{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0},
-                 {VmOpcode::STORE, f_rm(), 0, 0, 0}},
-                0, N);
-            bbs.push_back(std::move(sb));
-            prog.setup_du_count = 1;
+            add_setup({{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0},
+                       {VmOpcode::STORE, f_rm(), 0, 0, 0}});
             break;
-        }
-        case Setup::OramPush: {
+
+        case Setup::OramPush:
             pool.push_back(42);
-            auto sb = make_setup_bb(next_bb_id++, 0xA0,
-                {{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0}},
-                0, N);
-            bbs.push_back(std::move(sb));
-            prog.setup_du_count = 1;
+            add_setup({{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0}});
             break;
-        }
+
         case Setup::OramPop: {
-            // Need K PUSHes to fill stack, then K POPs measured.
-            // Setup: 1 LOAD_CONST BB + K PUSH BBs.
+            // Setup: 1 LOAD_CONST BB + K PUSH BBs, all fallthrough (no JMP).
+            // The LOAD_CONST BB must NOT use add_setup() because add_setup's
+            // JMP gets fixed to first_measured_id, skipping the PUSH BBs.
             pool.push_back(42);
-            auto sb = make_setup_bb(next_bb_id++, 0xA0,
-                {{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0}},
-                0, N);
-            bbs.push_back(std::move(sb));
-            prog.setup_du_count = 1;
-
+            {
+                TestInstruction lc{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0};
+                bbs.push_back(make_measured_bb(next_bb_id++, 0xA0,
+                                               lc, N, false));
+                prog.setup_du_count++;
+            }
             // K PUSH BBs (each is a DU with 1 PUSH + N-1 NOP)
             for (uint32_t i = 0; i < K; ++i) {
                 TestInstruction push_insn{VmOpcode::PUSH, f_r(), 0, 0, 0};
@@ -228,36 +239,32 @@ DUBenchProgram build_du_program(const OpcodeBenchSpec& spec,
             }
             break;
         }
-        case Setup::Pool: {
+
+        case Setup::Pool:
             for (uint32_t i = 0; i < K; ++i)
                 pool.push_back(i + 100);
-            // Pool index cycles: aux = i % pool_count
             break;
-        }
-        case Setup::CtxWrite: {
+
+        case Setup::CtxWrite:
             pool.push_back(0x800);
-            auto sb = make_setup_bb(next_bb_id++, 0xA0,
-                {{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0}},
-                0, N);
-            bbs.push_back(std::move(sb));
-            prog.setup_du_count = 1;
+            add_setup({{VmOpcode::LOAD_CONST, f_pool(), 0, 0, 0}});
             break;
-        }
-        case Setup::NativeCall: {
-            // One transition entry for all NATIVE_CALL instructions.
-            // All point to the same trivial native.
+
+        case Setup::NativeCall:
             break;
-        }
+
         default:
             break;
     }
 
-    // Fix setup BB JMP targets → first measured BB
+    // Fix setup BB JMP targets → first measured BB (N≥2 only; N=1 uses fallthrough)
     first_measured_id = next_bb_id;
     for (auto& sb : bbs) {
-        auto& last = sb.instructions.back();
-        if (last.opcode == VmOpcode::JMP)
-            last.aux = first_measured_id;
+        if (!sb.instructions.empty()) {
+            auto& last = sb.instructions.back();
+            if (last.opcode == VmOpcode::JMP)
+                last.aux = first_measured_id;
+        }
     }
 
     // ── Build K measured BBs ────────────────────────────────────────
diff --git a/runtime/bench/runner.hpp b/runtime/bench/runner.hpp
@@ -50,9 +50,13 @@ std::vector<BenchResult> run_all(const RunConfig& cfg) {
             prog.blob.data(), prog.blob.size(), prog.seed, delta);
         if (!engine) return 0;
 
-        // Untimed: step through setup DUs
-        for (uint32_t i = 0; i < prog.setup_du_count * N; ++i) {
-            auto sr = engine->step();
+        // Untimed: run setup DUs via dispatch_unit (NOT step()).
+        //
+        // WHY dispatch_unit: step() does not have the pipeline-level
+        // ORAM scan (Phase D.oram).  PUSH setup via step() would fail
+        // to write to ORAM, causing POP to read garbage.
+        for (uint32_t i = 0; i < prog.setup_du_count; ++i) {
+            auto sr = engine->dispatch_unit();
             if (!sr || *sr == VmResult::Halted) return 0;
         }
 
diff --git a/runtime/include/handler_impls.hpp b/runtime/include/handler_impls.hpp
@@ -140,36 +140,42 @@ struct HandlerTraits<VmOpcode::STORE, P> {
 };
 
 /// PUSH: register -> ORAM stack.
-/// Encode plaintext to memory domain, write via ORAM.  No register result.
+///
+/// Doc 19 pipeline-level ORAM: the ORAM write has already been executed
+/// by Phase D.oram in dispatch_unit (branchless, before handler dispatch).
+/// The handler only updates vm_sp.  No direct Oram::write call.
 template<typename P>
 struct HandlerTraits<VmOpcode::PUSH, P> {
     static constexpr auto security_class = SecurityClass::A;
     using oram_tag = UsesOramTag;
     template<typename Oram>
-    static HandlerResult exec(VmExecution& e, VmEpoch&, VmOramState& o,
-                               const VmImmutable& im, const DecodedInsn& i) noexcept {
+    static HandlerResult exec(VmExecution& e, VmEpoch&, VmOramState&,
+                               const VmImmutable&, const DecodedInsn&) noexcept {
         if (e.vm_sp < 8) return tl::make_unexpected(DiagnosticCode::StackOverflow);
         e.vm_sp -= 8;
-        MemVal mem(im.mem.encode_lut().apply(i.plain_a));
-        Oram::write(o, e.vm_sp, mem);
+        // ORAM write already done by pipeline Phase D.oram at the pre-decrement
+        // vm_sp - 8 (i.e. the new e.vm_sp), with encode_lut().apply(plain_a).
         return {};
     }
 };
 
 /// POP: ORAM stack -> register.
-/// Read MemVal from ORAM, decode from memory domain to plaintext.
-/// Pipeline will FPE-encode regs[dst].
+///
+/// Doc 19 pipeline-level ORAM: the ORAM read has already been executed
+/// by Phase D.oram in dispatch_unit.  The result is in VmExecution::oram_read_result.
+/// The handler decodes and writes to the destination register.
 template<typename P>
 struct HandlerTraits<VmOpcode::POP, P> {
     static constexpr auto security_class = SecurityClass::A;
     using oram_tag = UsesOramTag;
     template<typename Oram>
-    static HandlerResult exec(VmExecution& e, VmEpoch&, VmOramState& o,
+    static HandlerResult exec(VmExecution& e, VmEpoch&, VmOramState&,
                                const VmImmutable& im, const DecodedInsn& i) noexcept {
         if (e.vm_sp >= VM_OBLIVIOUS_SIZE) return tl::make_unexpected(DiagnosticCode::StackUnderflow);
-        MemVal mem = Oram::read(o, e.vm_sp);
+        // ORAM read already done by pipeline Phase D.oram at vm_sp.
+        // Result is in e.oram_read_result (raw MemVal bits).
         e.vm_sp += 8;
-        uint64_t plain = im.mem.decode_lut().apply(mem.bits);
+        uint64_t plain = im.mem.decode_lut().apply(e.oram_read_result);
         e.regs[i.reg_a] = RegVal(plain);
         return {};
     }
diff --git a/runtime/include/oram_strategy.hpp b/runtime/include/oram_strategy.hpp
@@ -70,16 +70,25 @@ struct RollingKeyOram {
     /// Scans all 64 cache lines and re-encrypts entire workspace.
     static void write(VmOramState& state, uint64_t offset, MemVal val) noexcept;
 
-    /// Unconditional dummy scan (Doc 19 §C.1, ORAM Invariant).
+    /// Unified ORAM access — always performs a full 64-line scan.
     ///
-    /// WHY: every dispatch_unit must produce the same ORAM access pattern
-    /// regardless of opcode mix.  A DU with PUSH/POP triggers real ORAM
-    /// scans; a DU with only ALU ops does not.  The dummy scan ensures
-    /// at least 1 full scan per DU, normalizing the memory bus frequency
-    /// to constant rate (Doc 19 Appendix C.4).
+    /// WHY unified (Doc 19 pipeline-level normalization):
+    ///   The dispatch_unit pipeline calls this once per sub-instruction,
+    ///   unconditionally.  Branchless MUX in the pipeline selects the
+    ///   address, value, and direction based on the decoded opcode:
+    ///     PUSH: addr=vm_sp-8, value=encoded, is_write=true
+    ///     POP:  addr=vm_sp,   value=0,       is_write=false
+    ///     else: addr=0,       value=0,       is_write=false (dummy)
     ///
-    /// Implementation: read-equivalent — full 64-line scan + re-encrypt +
-    /// nonce bump, identical cost to a real read.  Result discarded.
+    ///   Every sub-instruction does exactly 1 scan at the same cost.
+    ///   PUSH/POP handlers no longer call read/write directly.
+    ///
+    /// @return  read result (meaningful for POP; 0 for PUSH/dummy)
+    [[nodiscard]] static uint64_t access(VmOramState& state, uint64_t addr,
+                                         uint64_t write_value,
+                                         bool is_write) noexcept;
+
+    /// Unconditional dummy scan (legacy — kept for backward compatibility).
     static void dummy_scan(VmOramState& state) noexcept;
 };
 
@@ -103,8 +112,12 @@ struct DirectOram {
     /// Write 8 bytes to workspace at `offset` (direct indexed access).
     static void write(VmOramState& state, uint64_t offset, MemVal val) noexcept;
 
+    /// Unified access for DirectOram — direct indexed, no oblivious scan.
+    [[nodiscard]] static uint64_t access(VmOramState& state, uint64_t addr,
+                                         uint64_t write_value,
+                                         bool is_write) noexcept;
+
     /// No-op dummy scan — DirectOram does not need timing normalization.
-    /// DebugPolicy::constant_time == false, so timing leaks are acceptable.
     static void dummy_scan(VmOramState&) noexcept {}
 };
 
diff --git a/runtime/include/vm_state.hpp b/runtime/include/vm_state.hpp
@@ -196,6 +196,17 @@ struct alignas(64) VmExecution {
 
     uint64_t trash_regs[VM_REG_COUNT] = {};
 
+    // ── ORAM staging (Doc 19 pipeline-level normalization) ──────────────
+
+    /// Result from the per-sub-instruction unconditional ORAM scan.
+    ///
+    /// WHY staging: ORAM scans are moved from PUSH/POP handlers into the
+    /// dispatch_unit pipeline so every sub-instruction does exactly 1 scan.
+    /// POP handler reads this field instead of calling Oram::read directly.
+    /// PUSH handler ignores it (write-only, result is meaningless).
+    /// NOP/ALU handlers ignore it (dummy scan at offset 0, result discarded).
+    uint64_t oram_read_result = 0;
+
     // ── Doc 16 forward-secrecy state ────────────────────────────────────
 
     /// Current Speck-FPE key for register encoding/decoding.
diff --git a/runtime/src/oram_strategies.cpp b/runtime/src/oram_strategies.cpp
diff --git a/runtime/src/vm_engine.cpp b/runtime/src/vm_engine.cpp