Skip to content

Commit fa6035a

Browse files
committed
perf(runtime): reduce fixed overhead without changing security semantics (P1-P8)
Remove waste around mandatory crypto work while preserving all security invariants from docs 16, 17, and 19. No phases removed, no branch shapes changed, no crypto semantics altered.

- P1: replace heap alloc in verify_bb_mac with stack buffer (VM_MAX_BB_INSN_CAP)
- P2: replace O(n) find_bb_index with O(1) dense vector lookup
- P3: hash exec.regs[] directly, eliminating 128-byte copy per instruction
- P4: preexpand BLAKE3 [K||K] key once for Phase F + Phase G
- P5: precompute bb_end_ip at load time
- P6: pre-decode bb_enc_seed as uint64_t at load time
- P7: build siphash_expand message prefix once outside 8-iteration loop
- P8: hoist ORAM temp buffers out of 64-line loop, reuse keystream buffer
1 parent 1348f05 commit fa6035a

8 files changed

Lines changed: 182 additions & 67 deletions

File tree

common/include/vm/vm_context.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ constexpr uint8_t VM_BYTE_LANES = 8;
2525
/// Maximum nesting depth for shadow stack.
2626
constexpr uint8_t VM_MAX_NESTING = 8;
2727

28+
/// Compile-time upper bound on instructions per basic block.
29+
/// Used for stack-allocated MAC scratch buffers (avoids heap allocation on hot path).
30+
/// Blobs containing a basic block with more instructions than this limit are rejected at load time.
31+
constexpr uint32_t VM_MAX_BB_INSN_CAP = 1024;
32+
2833
/// Forward declaration for platform-specific native context.
2934
struct NativeContext;
3035

@@ -46,6 +51,14 @@ struct BBMetadata {
4651
uint16_t live_regs_bitmap;
4752
uint8_t bb_enc_seed[8];
4853
uint8_t epoch_seed[32];
54+
55+
// ── Derived fields (computed once at blob load, avoid repeated work) ──
56+
57+
/// entry_ip + insn_count_in_bb — avoids repeated addition on hot path.
58+
uint32_t bb_end_ip = 0;
59+
60+
/// bb_enc_seed decoded as native uint64_t — avoids repeated memcpy.
61+
uint64_t bb_enc_seed_u64 = 0;
4962
};
5063

5164
/// Epoch checkpoint for shadow stack (CALL_VM / RET_VM).

common/include/vm/vm_crypto.hpp

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,10 @@ inline void sip_round(uint64_t& v0, uint64_t& v1,
160160
///
161161
/// Each output word: SipHash(key, nonce || line || sub_index)
162162
///
163+
/// Optimization (P7): the 16-byte message prefix (nonce || line) is built once;
164+
/// only the final 8-byte sub_index is overwritten per iteration. This avoids
165+
/// rebuilding the fixed 16-byte prefix 8 times. Output is identical.
166+
///
163167
/// @param key 16-byte ORAM key
164168
/// @param nonce ORAM nonce
165169
/// @param line cache line index
@@ -168,12 +172,14 @@ inline void siphash_expand(const uint8_t key[16],
168172
uint64_t nonce,
169173
uint64_t line,
170174
uint64_t out[8]) noexcept {
175+
// Build the fixed prefix once: nonce(8) || line(8)
176+
uint8_t msg[24];
177+
std::memcpy(msg, &nonce, 8);
178+
std::memcpy(msg + 8, &line, 8);
179+
171180
for (uint64_t i = 0; i < 8; ++i) {
172-
// Build a 24-byte message: nonce(8) || line(8) || sub_index(8)
173-
uint8_t msg[24];
174-
std::memcpy(msg, &nonce, 8);
175-
std::memcpy(msg + 8, &line, 8);
176-
std::memcpy(msg + 16, &i, 8);
181+
// Only overwrite the sub_index portion
182+
std::memcpy(msg + 16, &i, 8);
177183
out[i] = siphash_2_4(key, msg, 24);
178184
}
179185
}
@@ -234,6 +240,41 @@ void blake3_keyed_128(const uint8_t key128[16],
234240
const uint8_t* data, size_t data_len,
235241
uint8_t* out, size_t out_len) noexcept;
236242

243+
/// Pre-expand a 128-bit key to a 256-bit BLAKE3 key: [K || K].
244+
///
245+
/// This avoids repeating the expansion when the same 128-bit key is used
246+
/// for multiple BLAKE3 calls within a single instruction (Phase F + Phase G).
247+
///
248+
/// @param key128 16-byte source key
249+
/// @param out256 32-byte output (caller must zero after use)
250+
void blake3_preexpand_128(const uint8_t key128[16],
251+
uint8_t out256[32]) noexcept;
252+
253+
/// BLAKE3 keyed hash using an already-expanded 256-bit key.
254+
///
255+
/// Same as blake3_keyed_128 but skips the [K||K] expansion step.
256+
///
257+
/// @param expanded_key 32-byte pre-expanded key (from blake3_preexpand_128)
258+
/// @param data input data
259+
/// @param data_len length of input data
260+
/// @param out output buffer for hash
261+
/// @param out_len number of output bytes
262+
void blake3_keyed_preexpanded(const uint8_t expanded_key[32],
263+
const uint8_t* data, size_t data_len,
264+
uint8_t* out, size_t out_len) noexcept;
265+
266+
/// Fingerprint all 16 encoded registers using a pre-expanded BLAKE3 key.
267+
///
268+
/// Same as blake3_keyed_fingerprint but avoids redundant key expansion
269+
/// when the caller has already called blake3_preexpand_128.
270+
///
271+
/// @param expanded_key 32-byte pre-expanded key
272+
/// @param encoded_regs array of 16 × uint64_t encoded register values
273+
/// @param out128 16-byte output fingerprint
274+
void blake3_keyed_fingerprint_preexpanded(const uint8_t expanded_key[32],
275+
const uint64_t encoded_regs[16],
276+
uint8_t out128[16]) noexcept;
277+
237278
/// Fingerprint all 16 encoded registers using BLAKE3_KEYED_128.
238279
///
239280
/// WHY FINGERPRINT ALL REGISTERS:

common/src/vm/vm_crypto.cpp

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,39 @@ void blake3_keyed_hash(const uint8_t key[32], const uint8_t* data, size_t data_l
2222
blake3_hasher_finalize(&hasher, out, out_len);
2323
}
2424

25+
void blake3_preexpand_128(const uint8_t key128[16],
26+
uint8_t out256[32]) noexcept {
27+
std::memcpy(out256, key128, 16);
28+
std::memcpy(out256 + 16, key128, 16);
29+
}
30+
31+
void blake3_keyed_preexpanded(const uint8_t expanded_key[32],
32+
const uint8_t* data, size_t data_len,
33+
uint8_t* out, size_t out_len) noexcept {
34+
blake3_hasher hasher;
35+
blake3_hasher_init_keyed(&hasher, expanded_key);
36+
blake3_hasher_update(&hasher, data, data_len);
37+
blake3_hasher_finalize(&hasher, out, out_len);
38+
}
39+
40+
void blake3_keyed_fingerprint_preexpanded(const uint8_t expanded_key[32],
41+
const uint64_t encoded_regs[16],
42+
uint8_t out128[16]) noexcept {
43+
blake3_keyed_preexpanded(expanded_key,
44+
reinterpret_cast<const uint8_t*>(encoded_regs),
45+
16 * sizeof(uint64_t),
46+
out128, 16);
47+
}
48+
2549
void blake3_keyed_128(const uint8_t key128[16],
2650
const uint8_t* data, size_t data_len,
2751
uint8_t* out, size_t out_len) noexcept {
2852
// Extend 128-bit key to 256-bit by repeating: [K || K].
2953
// Security level remains 128-bit (matching Speck64/128 key size).
3054
uint8_t extended[32];
31-
std::memcpy(extended, key128, 16);
32-
std::memcpy(extended + 16, key128, 16);
55+
blake3_preexpand_128(key128, extended);
3356

34-
blake3_hasher hasher;
35-
blake3_hasher_init_keyed(&hasher, extended);
36-
blake3_hasher_update(&hasher, data, data_len);
37-
blake3_hasher_finalize(&hasher, out, out_len);
57+
blake3_keyed_preexpanded(extended, data, data_len, out, out_len);
3858

3959
// Zero the extended key — prevent the redundant copy from persisting
4060
// in stack memory after this function returns.

runtime/include/vm_engine.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ using Common::VM::Crypto::FPE_Encode;
7070
using Common::VM::Crypto::FPE_Decode;
7171
using Common::VM::Crypto::blake3_keyed_128;
7272
using Common::VM::Crypto::blake3_keyed_fingerprint;
73+
using Common::VM::Crypto::blake3_preexpand_128;
74+
using Common::VM::Crypto::blake3_keyed_preexpanded;
75+
using Common::VM::Crypto::blake3_keyed_fingerprint_preexpanded;
7376
using Common::VM::secure_zero;
7477
using Common::VM::opcode_writes_reg;
7578

runtime/include/vm_state.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include <cstdint>
4242
#include <cstddef>
4343
#include <cstring>
44+
#include <type_traits>
4445
#include <vector>
4546

4647
namespace VMPilot::Runtime {
@@ -130,6 +131,13 @@ struct VmImmutable {
130131
/// verify_bb_mac always iterates max_bb_insn_count times, with dummy
131132
/// SipHash iterations for indices beyond the actual BB.
132133
uint32_t max_bb_insn_count = 0;
134+
135+
/// O(1) bb_id -> bb_metadata index lookup table.
136+
///
137+
/// Dense vector sized to max_bb_id + 1, filled with UINT32_MAX (invalid).
138+
/// Replaces the linear scan in find_bb_index() — same semantics, O(1) cost.
139+
/// Built once during VmEngine::create().
140+
std::vector<uint32_t> bb_id_to_index;
133141
};
134142

135143
// ─────────────────────────────────────────────────────────────────────────────
@@ -233,6 +241,16 @@ static_assert(alignof(VmExecution) >= 64,
233241
static_assert(offsetof(VmExecution, regs) == 0,
234242
"regs must be at offset 0 (first cache line)");
235243

244+
// Verify RegVal layout is compatible with uint64_t for direct fingerprinting (P3).
245+
// This allows blake3_keyed_fingerprint to hash exec.regs[] directly without
246+
// copying into a temporary uint64_t[16] array.
247+
static_assert(sizeof(RegVal) == sizeof(uint64_t),
248+
"RegVal must be 8 bytes for direct fingerprint hashing");
249+
static_assert(std::is_standard_layout<RegVal>::value,
250+
"RegVal must be standard layout for safe reinterpret_cast");
251+
static_assert(sizeof(VmExecution::regs) == VM_REG_COUNT * sizeof(uint64_t),
252+
"regs array must be tightly packed (no padding)");
253+
236254
// ─────────────────────────────────────────────────────────────────────────────
237255
// VmEpoch — per-BB opcode permutation, trivially copyable
238256
// ─────────────────────────────────────────────────────────────────────────────

runtime/src/oram_strategies.cpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,22 @@ static uint64_t oram_access_impl(VmOramState& state, uint64_t addr,
4040

4141
uint64_t read_result = 0;
4242

43+
// P8: hoist temporary buffers out of the loop to reduce stack churn.
44+
// Reuse a single keystream buffer for both old and new expansions
45+
// (old_ks is consumed before new_ks is computed, so they can share).
46+
alignas(64) uint64_t words[8];
47+
uint64_t ks[8];
48+
4349
for (uint32_t line = 0; line < VM_ORAM_NUM_LINES; ++line) {
4450
uint8_t* line_ptr = state.workspace + line * VM_ORAM_LINE_SIZE;
4551

4652
// 1. Load 64 bytes
47-
uint64_t words[8];
4853
std::memcpy(words, line_ptr, 64);
4954

5055
// 2. Decrypt with old nonce
51-
uint64_t old_ks[8];
52-
siphash_expand(state.key, old_nonce, line, old_ks);
56+
siphash_expand(state.key, old_nonce, line, ks);
5357
for (int i = 0; i < 8; ++i)
54-
words[i] ^= old_ks[i];
58+
words[i] ^= ks[i];
5559

5660
// 3. Branchless read + conditional write
5761
const bool is_target_line = (line == target_line);
@@ -70,11 +74,10 @@ static uint64_t oram_access_impl(VmOramState& state, uint64_t addr,
7074
words[w] = (written & w_sel) | (words[w] & ~w_sel);
7175
}
7276

73-
// 4. Re-encrypt with fresh nonce
74-
uint64_t new_ks[8];
75-
siphash_expand(state.key, state.nonce, line, new_ks);
77+
// 4. Re-encrypt with fresh nonce (reuse ks buffer)
78+
siphash_expand(state.key, state.nonce, line, ks);
7679
for (int i = 0; i < 8; ++i)
77-
words[i] ^= new_ks[i];
80+
words[i] ^= ks[i];
7881

7982
// 5. Store back
8083
std::memcpy(line_ptr, words, 64);

runtime/src/pipeline.cpp

Lines changed: 19 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
#include <vm/secure_zero.hpp>
3131

3232
#include <cstring>
33-
#include <vector>
3433

3534
namespace VMPilot::Runtime::pipeline {
3635

@@ -63,13 +62,13 @@ static uint64_t update_enc_state_impl(uint64_t enc_state,
6362
return siphash_2_4(key, msg, 8);
6463
}
6564

66-
/// Find BB index by bb_id (linear search -- v1 simplicity).
65+
/// Find BB index by bb_id — O(1) dense vector lookup (P2).
66+
/// Returns -1 for unknown bb_id (same semantics as the original linear scan).
6767
static int find_bb_index(const VmImmutable& imm, uint32_t bb_id) noexcept {
68-
for (size_t i = 0; i < imm.bb_metadata.size(); ++i) {
69-
if (imm.bb_metadata[i].bb_id == bb_id)
70-
return static_cast<int>(i);
71-
}
72-
return -1;
68+
if (bb_id >= imm.bb_id_to_index.size())
69+
return -1;
70+
uint32_t idx = imm.bb_id_to_index[bb_id];
71+
return (idx == UINT32_MAX) ? -1 : static_cast<int>(idx);
7372
}
7473

7574
// ---------------------------------------------------------------------------
@@ -318,9 +317,8 @@ enter_basic_block(VmExecution& exec,
318317
const BBMetadata& target = imm.bb_metadata[static_cast<size_t>(bb_idx)];
319318

320319
// 2. Use pre-derived bb_enc_seed from BBMetadata (no stored_seed needed)
321-
uint64_t enc_seed_u64 = 0;
322-
std::memcpy(&enc_seed_u64, target.bb_enc_seed, 8);
323-
exec.enc_state = enc_seed_u64;
320+
// P6: pre-decoded at load time, avoids repeated memcpy
321+
exec.enc_state = target.bb_enc_seed_u64;
324322

325323
// 3. Reset instruction tracking
326324
exec.insn_index_in_bb = 0;
@@ -460,12 +458,8 @@ verify_bb_mac(const VmImmutable& imm,
460458
const uint32_t real_count = bb.insn_count_in_bb;
461459
const uint32_t padded_count = imm.max_bb_insn_count;
462460

463-
// Derive bb_enc_seed for this BB
464-
uint8_t enc_seed_bytes[8];
465-
std::memcpy(enc_seed_bytes, bb.bb_enc_seed, 8);
466-
467-
uint64_t enc_state = 0;
468-
std::memcpy(&enc_state, enc_seed_bytes, 8);
461+
// P6: use pre-decoded enc_state (no memcpy needed)
462+
uint64_t enc_state = bb.bb_enc_seed_u64;
469463

470464
// Re-decrypt all instructions in this BB + dummy iterations for padding.
471465
//
@@ -479,8 +473,10 @@ verify_bb_mac(const VmImmutable& imm,
479473
auto insns = imm.blob.instructions();
480474
const uint32_t total_insn_count = imm.blob.header().insn_count;
481475

482-
// Stack buffer for MAC computation (only real instructions contribute)
483-
std::vector<uint8_t> plaintext_bytes(real_count * 8);
476+
// P1: stack-allocated scratch buffer (no heap allocation on hot path).
477+
// Sized to compile-time cap; blobs exceeding VM_MAX_BB_INSN_CAP are
478+
// rejected at load time.
479+
uint8_t plaintext_bytes[VM_MAX_BB_INSN_CAP * 8];
484480

485481
for (uint32_t j = 0; j < padded_count; ++j) {
486482
const bool is_real = (j < real_count);
@@ -496,7 +492,7 @@ verify_bb_mac(const VmImmutable& imm,
496492

497493
// Only write real plaintext to MAC buffer
498494
if (is_real)
499-
std::memcpy(plaintext_bytes.data() + j * 8, &plain, 8);
495+
std::memcpy(plaintext_bytes + j * 8, &plain, 8);
500496

501497
// Always check REKEY (dummy iterations: sem_op won't match REKEY)
502498
VmInsn vinst{};
@@ -526,8 +522,8 @@ verify_bb_mac(const VmImmutable& imm,
526522
// MAC is computed over REAL instructions only (matches blob builder)
527523
uint8_t computed_mac[8];
528524
blake3_keyed_hash(imm.integrity_key,
529-
plaintext_bytes.data(),
530-
plaintext_bytes.size(),
525+
plaintext_bytes,
526+
static_cast<size_t>(real_count) * 8,
531527
computed_mac, 8);
532528

533529
// Compare with stored MAC (constant-time)
@@ -556,13 +552,8 @@ void replay_enc_state(VmExecution& exec, const VmEpoch& epoch,
556552

557553
const auto& bb = imm.bb_metadata[exec.current_bb_index];
558554

559-
// Derive bb_enc_seed from scratch (enter_basic_block already set enc_state,
560-
// but we need the seed for the keystream replay).
561-
uint8_t enc_seed_bytes[8];
562-
std::memcpy(enc_seed_bytes, bb.bb_enc_seed, 8);
563-
564-
uint64_t es = 0;
565-
std::memcpy(&es, enc_seed_bytes, 8);
555+
// P6: use pre-decoded enc_seed (no memcpy needed)
556+
uint64_t es = bb.bb_enc_seed_u64;
566557

567558
// Replay SipHash chain: decrypt each instruction [0..target_insn_idx) and
568559
// advance enc_state, handling REKEY mutations along the way.

0 commit comments

Comments
 (0)