Skip to content

Commit 0e27024

Browse files
author
Requiem
committed
feat: entropy provider generation for vmexit latency calculation
1 parent 0a4d5c8 commit 0e27024

File tree

1 file changed

+75
-74
lines changed

1 file changed

+75
-74
lines changed

src/vmaware.hpp

Lines changed: 75 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -4698,79 +4698,6 @@ struct VM {
46984698
// The hypervisor cannot easily rewind the system wall clock (second loop, QIT/KUSER_SHARED_DATA) without causing system instability (network timeouts, audio lag, etc)
46994699
static thread_local volatile u64 g_sink = 0; // thread_local volatile so that it doesnt need to be captured by the lambda
47004700

4701-
// First we start by randomizing counts WITHOUT syscalls and WITHOUT using instructions that can be trapped by hypervisors, this was a hard task
4702-
struct entropy_provider {
4703-
// prevent inlining so optimizer can't fold this easily
4704-
#if (MSVC && !CLANG)
4705-
__declspec(noinline)
4706-
#else
4707-
__attribute__((noinline))
4708-
#endif
4709-
ULONG64 operator()() const noexcept {
4710-
// TO prevent hoisting across this call
4711-
std::atomic_signal_fence(std::memory_order_seq_cst);
4712-
4713-
// start state (golden ratio)
4714-
volatile ULONG64 v = 0x9E3779B97F4A7C15ULL;
4715-
4716-
// mix in addresses (ASLR gives entropy but if ASLR disabled or bypassed we have some tricks still)
4717-
// Take addresses of various locals/statics and mark some volatile so they cannot be optimized away
4718-
volatile int local_static = 0; // local volatile (stack-like)
4719-
static volatile int module_static = 0; // static in function scope (image address)
4720-
auto probe_lambda = []() noexcept {}; // stack-local lambda object
4721-
uintptr_t pa = reinterpret_cast<uintptr_t>(&v);
4722-
uintptr_t pb = reinterpret_cast<uintptr_t>(&local_static);
4723-
uintptr_t pc = reinterpret_cast<uintptr_t>(&module_static);
4724-
uintptr_t pd = reinterpret_cast<uintptr_t>(&probe_lambda);
4725-
4726-
v ^= static_cast<ULONG64>(pa) + 0x9E3779B97F4A7C15ULL + (v << 6) + (v >> 2);
4727-
v ^= static_cast<ULONG64>(pb) + (v << 7);
4728-
v ^= static_cast<ULONG64>(pc) + (v >> 11);
4729-
v ^= static_cast<ULONG64>(pd) + 0xBF58476D1CE4E5B9ULL;
4730-
4731-
// dependent operations on volatile locals to prevent elimination
4732-
for (int i = 0; i < 24; ++i) {
4733-
volatile int stack_local = i ^ static_cast<int>(v);
4734-
// take address each iteration and fold it in
4735-
uintptr_t la = reinterpret_cast<uintptr_t>(&stack_local);
4736-
v ^= (static_cast<ULONG64>(la) + (static_cast<ULONG64>(i) * 0x9E3779B97F4A7CULL));
4737-
// dependent shifts to spread any small differences
4738-
v ^= (v << ((i & 31)));
4739-
v ^= (v >> (((i + 13) & 31)));
4740-
// so compiler can't remove the local entirely
4741-
std::atomic_signal_fence(std::memory_order_seq_cst);
4742-
}
4743-
4744-
// final avalanche! (as said before, just in case ASLR can be folded)
4745-
v ^= (v << 13);
4746-
v ^= (v >> 7);
4747-
v ^= (v << 17);
4748-
v *= 0x2545F4914F6CDD1DULL;
4749-
v ^= (v >> 33);
4750-
4751-
// another compiler fence to prevent hoisting results
4752-
std::atomic_signal_fence(std::memory_order_seq_cst);
4753-
4754-
return static_cast<ULONG64>(v);
4755-
}
4756-
};
4757-
4758-
// Use rejection sampling as before to avoid modulo bias
4759-
auto rng = [](ULONG64 min, ULONG64 max, auto getrand) noexcept -> ULONG64 {
4760-
const ULONG64 range = max - min + 1;
4761-
const ULONG64 limit = (~0ULL) - ((~0ULL) % range);
4762-
for (;;) {
4763-
const ULONG64 r = getrand();
4764-
if (r < limit) return min + (r % range);
4765-
// small local mix to change subsequent outputs (still in user-mode and not a syscall)
4766-
volatile ULONG64 scrub = r;
4767-
scrub ^= (scrub << 11);
4768-
scrub ^= (scrub >> 9);
4769-
(void)scrub;
4770-
}
4771-
};
4772-
4773-
const entropy_provider entropyProv{};
47744701
// the reason why we use CPUID rather than RDTSC is because RDTSC is a conditionally exiting instruction, and you can modify the guest TSC without trapping it
47754702
auto vm_exit = []() noexcept -> u64 {
47764703
volatile int regs[4] = { 0 }; // doesn't need to be as elaborate as the next cpuid_lambda we will use to calculate the real latency
@@ -5078,6 +5005,80 @@ struct VM {
50785005
return result;
50795006
};
50805007

5008+
// First we start by randomizing counts WITHOUT syscalls and WITHOUT using instructions that can be trapped by hypervisors, this was a hard task
5009+
struct entropy_provider {
5010+
// prevent inlining so optimizer can't fold this easily
5011+
#if (MSVC && !CLANG)
5012+
__declspec(noinline)
5013+
#else
5014+
__attribute__((noinline))
5015+
#endif
5016+
ULONG64 operator()() const noexcept {
5017+
// TO prevent hoisting across this call
5018+
std::atomic_signal_fence(std::memory_order_seq_cst);
5019+
5020+
// start state (golden ratio)
5021+
volatile ULONG64 v = 0x9E3779B97F4A7C15ULL;
5022+
5023+
// mix in addresses (ASLR gives entropy but if ASLR disabled or bypassed we have some tricks still)
5024+
// Take addresses of various locals/statics and mark some volatile so they cannot be optimized away
5025+
volatile int local_static = 0; // local volatile (stack-like)
5026+
static volatile int module_static = 0; // static in function scope (image address)
5027+
auto probe_lambda = []() noexcept {}; // stack-local lambda object
5028+
uintptr_t pa = reinterpret_cast<uintptr_t>(&v);
5029+
uintptr_t pb = reinterpret_cast<uintptr_t>(&local_static);
5030+
uintptr_t pc = reinterpret_cast<uintptr_t>(&module_static);
5031+
uintptr_t pd = reinterpret_cast<uintptr_t>(&probe_lambda);
5032+
5033+
v ^= static_cast<ULONG64>(pa) + 0x9E3779B97F4A7C15ULL + (v << 6) + (v >> 2);
5034+
v ^= static_cast<ULONG64>(pb) + (v << 7);
5035+
v ^= static_cast<ULONG64>(pc) + (v >> 11);
5036+
v ^= static_cast<ULONG64>(pd) + 0xBF58476D1CE4E5B9ULL;
5037+
5038+
// dependent operations on volatile locals to prevent elimination
5039+
for (int i = 0; i < 24; ++i) {
5040+
volatile int stack_local = i ^ static_cast<int>(v);
5041+
// take address each iteration and fold it in
5042+
uintptr_t la = reinterpret_cast<uintptr_t>(&stack_local);
5043+
v ^= (static_cast<ULONG64>(la) + (static_cast<ULONG64>(i) * 0x9E3779B97F4A7CULL));
5044+
// dependent shifts to spread any small differences
5045+
v ^= (v << ((i & 31)));
5046+
v ^= (v >> (((i + 13) & 31)));
5047+
// so compiler can't remove the local entirely
5048+
std::atomic_signal_fence(std::memory_order_seq_cst);
5049+
}
5050+
5051+
// final avalanche! (as said before, just in case ASLR can be folded)
5052+
v ^= (v << 13);
5053+
v ^= (v >> 7);
5054+
v ^= (v << 17);
5055+
v *= 0x2545F4914F6CDD1DULL;
5056+
v ^= (v >> 33);
5057+
5058+
// another compiler fence to prevent hoisting results
5059+
std::atomic_signal_fence(std::memory_order_seq_cst);
5060+
5061+
return static_cast<ULONG64>(v);
5062+
}
5063+
};
5064+
5065+
// rejection sampling as before to avoid modulo bias
5066+
auto rng = [](ULONG64 min, ULONG64 max, auto getrand) noexcept -> ULONG64 {
5067+
const ULONG64 range = max - min + 1;
5068+
const ULONG64 limit = (~0ULL) - ((~0ULL) % range);
5069+
for (;;) {
5070+
const ULONG64 r = getrand();
5071+
if (r < limit) return min + (r % range);
5072+
// small local mix to change subsequent outputs (still in user-mode and not a syscall)
5073+
volatile ULONG64 scrub = r;
5074+
scrub ^= (scrub << 11);
5075+
scrub ^= (scrub >> 9);
5076+
(void)scrub;
5077+
}
5078+
};
5079+
5080+
const entropy_provider entropyProv{};
5081+
50815082
// Intel leaves on an AMD CPU and viceversa will still work for this probe
50825083
// for leaves like 0 that just return static data, like "AuthenticAMD" or "GenuineIntel", a fast exit path could be made
50835084
// for other leaves like the extended state that rely on dynamic system states like APIC IDs and XState, kernel data locks are required
@@ -5098,7 +5099,7 @@ struct VM {
50985099
};
50995100
constexpr size_t n_leaves = sizeof(leaves) / sizeof(leaves[0]);
51005101

5101-
constexpr u16 iterations = 100;
5102+
const size_t iterations = static_cast<size_t>(rng(100, 200, [&entropyProv]() noexcept { return entropyProv(); }));
51025103

51035104
// pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
51045105
std::vector<u64> samples;

0 commit comments

Comments
 (0)