Skip to content

Commit b3c5281

Browse files
authored
detection of interrupt-based bypasses to timing attacks
2 parents 8a7cadc + 54b488d commit b3c5281

2 files changed

Lines changed: 124 additions & 70 deletions

File tree

src/cli.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ class win_ansi_enabler_t
143143
};
144144
#endif
145145

146+
#ifdef __VMAWARE_DEBUG__
146147
struct SHA256 {
147148
u8 buf[64] = {}; // message block buffer
148149
u32 len = 0; // bytes currently in buf
@@ -320,6 +321,7 @@ static std::string compute_self_sha256() {
320321
}
321322
return out;
322323
}
324+
#endif
323325

324326
[[noreturn]] static void help(void) {
325327
std::cout <<

src/vmaware.hpp

Lines changed: 122 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -5383,13 +5383,12 @@ struct VM {
53835383

53845384
/**
53855385
* @brief Check for timing anomalies in the system
5386-
* @category x86, Windows
5386+
* @category Windows, x86
53875387
* @implements VM::TIMER
53885388
*/
53895389
[[nodiscard]] static bool timer() {
53905390
#if (x86 && WINDOWS)
5391-
// Detect a hypervisor without giving it time to react (when the hypervisor sees the vmexit, it's already too late for it, as the counter already exceeded the threshold)
5392-
// Uses our own software-based clock, meaning a hypervisor can't hide time by offsetting TSC or controlling any hardware timer
5391+
// The timing attack uses our own software-based clock, meaning a hypervisor can't hide time by offsetting TSC or controlling any other timer
53935392
double threshold = 3.5;
53945393
if (util::is_running_under_translator()) {
53955394
debug("TIMER: Running inside a binary translation layer");
@@ -5430,80 +5429,126 @@ struct VM {
54305429
return h;
54315430
}();
54325431

5433-
// search for the physical sibling of CPU 0, then pick a random CPU excluding it to avoid SMT locks
5434-
auto get_target_mask = []() -> DWORD_PTR {
5432+
// middle available logical CPU
5433+
auto get_trigger_mask = []() -> DWORD_PTR {
54355434
const HANDLE current_process = reinterpret_cast<HANDLE>(-1LL);
54365435

5437-
DWORD_PTR procMask = 0, sysMask = 0;
5438-
GetProcessAffinityMask(current_process, &procMask, &sysMask);
5436+
DWORD_PTR proc_mask = 0, sys_mask = 0;
5437+
GetProcessAffinityMask(current_process, &proc_mask, &sys_mask);
5438+
5439+
DWORD idxs[64]{};
5440+
DWORD n = 0;
5441+
for (DWORD i = 0; i < 64; ++i) {
5442+
if (proc_mask & (1ull << i)) {
5443+
idxs[n++] = i;
5444+
}
5445+
}
5446+
5447+
if (!n) return 1ull;
5448+
5449+
// middle available logical CPU, because statistically it normally has fewer DPCs/interrupts; we could also query the Windows API to fetch the interrupt count or DPC time
5450+
// 20 available CPUs -> idxs[10] -> core 11 in 1-based numbering
5451+
const DWORD middle_pos = n / 2;
5452+
return 1ull << idxs[middle_pos];
5453+
};
5454+
5455+
// random logical CPU, but exclude the trigger_thread, first, second and last available logical CPUs, avoiding SMT siblings
5456+
auto get_counter_mask = []() -> DWORD_PTR {
5457+
const HANDLE current_process = reinterpret_cast<HANDLE>(-1LL);
5458+
DWORD_PTR proc_mask = 0, sys_mask = 0;
5459+
GetProcessAffinityMask(current_process, &proc_mask, &sys_mask);
54395460

5461+
// get topology to identify SMT siblings
54405462
DWORD len = 0;
54415463
GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &len);
54425464

5443-
BYTE stackBuf[1024]{};
5444-
BYTE* buf = stackBuf;
5465+
// stack buffer fallback mechanism
5466+
BYTE stack_buf[1024]{};
5467+
std::vector<BYTE> heap_buf;
5468+
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = nullptr;
54455469

5446-
std::vector<BYTE> heapBuf;
5447-
if (len > sizeof(stackBuf)) {
5448-
heapBuf.resize(len);
5449-
buf = heapBuf.data();
5470+
if (len <= sizeof(stack_buf)) {
5471+
info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(stack_buf);
5472+
}
5473+
else {
5474+
heap_buf.resize(len);
5475+
info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(heap_buf.data());
54505476
}
54515477

5452-
GetLogicalProcessorInformationEx(RelationProcessorCore,
5453-
reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buf), &len);
5478+
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &len)) {
5479+
return 1ull; // fallback, it won't match the trigger_thread core
5480+
}
54545481

5455-
DWORD_PTR cpu0CoreMask = 0;
5456-
for (BYTE* p = buf; p < buf + len; ) {
5457-
const auto* r = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(p);
5458-
if (r->Relationship == RelationProcessorCore) {
5459-
for (WORD i = 0; i < r->Processor.GroupCount; ++i) {
5460-
const auto& m = r->Processor.GroupMask[i];
5461-
if (m.Group == 0 && (m.Mask & 1)) cpu0CoreMask |= m.Mask;
5482+
// map logical processor index to its physical core ID
5483+
DWORD logical_to_core[64] = { 0 };
5484+
DWORD_PTR core_mask[64] = { 0 };
5485+
size_t offset = 0;
5486+
DWORD core_idx = 0;
5487+
while (offset < len) {
5488+
auto ptr = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<BYTE*>(info) + offset);
5489+
for (DWORD i = 0; i < ptr->Processor.GroupCount; ++i) {
5490+
KAFFINITY mask = ptr->Processor.GroupMask[i].Mask;
5491+
for (int b = 0; b < 64; ++b) {
5492+
if (mask & (1ull << b)) {
5493+
logical_to_core[b] = core_idx;
5494+
core_mask[core_idx] |= (1ull << b);
5495+
}
54625496
}
54635497
}
5464-
p += r->Size;
5498+
offset += ptr->Size;
5499+
core_idx++;
54655500
}
54665501

5467-
const DWORD_PTR choices = procMask & ~cpu0CoreMask;
5468-
5502+
// available logical processors
54695503
DWORD idxs[64]{}, n = 0;
5470-
for (DWORD i = 0; i < 64; ++i)
5471-
if (choices & (1ull << i)) idxs[n++] = i;
5504+
for (DWORD i = 0; i < 64; ++i) {
5505+
if (proc_mask & (1ull << i)) idxs[n++] = i;
5506+
}
5507+
if (!n) return 1ull;
5508+
5509+
// exclude the cores that statistically receive a lot of DPCs/interrupts; we could also identify them through the Windows API by DPC time and interrupt count
5510+
const DWORD middle_pos = n / 2;
5511+
DWORD_PTR choices = proc_mask;
5512+
5513+
if (n >= 1) choices &= ~(1ull << idxs[0]); // first
5514+
if (n >= 2) choices &= ~(1ull << idxs[1]); // second
5515+
if (n >= 1) choices &= ~(1ull << idxs[n - 1]); // last
5516+
5517+
// exclude middle core (because it is where the trigger_thread runs) + avoid SMT siblings of that core
5518+
DWORD middle_logical = idxs[middle_pos];
5519+
DWORD middle_core_id = logical_to_core[middle_logical];
5520+
choices &= ~core_mask[middle_core_id];
5521+
5522+
// if exclusions leave nothing, fall back to the full mask
5523+
if (!choices) choices = proc_mask;
5524+
5525+
// random selection
5526+
DWORD pick[64]{}, m = 0;
5527+
for (DWORD i = 0; i < 64; ++i) {
5528+
if (choices & (1ull << i)) pick[m++] = i;
5529+
}
5530+
if (!m) return 1ull;
54725531

54735532
// random so that the hypervisor doesn't know where the counter thread is
5474-
// this will affect latency if cache lines from trigger_thread and counter_thread are separated
5533+
// this will affect latency if cache lines from trigger_thread and counter_thread are separated enough due to cores being too distant
54755534
// however, we do a ratio-based detection, so this won't affect the detection accuracy because the cache latency affects both samples
5476-
if (n) {
5477-
// std::random_device{}() uses RDRAND/RDSEED which can be intercepted by hypervisors
5478-
// we use our own compile-time seed that cannot be taken by examining PE/Linux binary properties and would need static/dynamic analysis
5479-
// this changes per build and per process session due to hardware ASLR
5480-
u64 seed = 0;
5481-
seed ^= static_cast<u64>(ct_seed);
5482-
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&current_process));
5483-
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&procMask)) << 1;
5484-
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&sysMask)) << 2;
5485-
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&len)) << 3;
5486-
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&stackBuf[0])) << 4;
5487-
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&heapBuf)) << 5;
5488-
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&buf)) << 6;
5489-
5490-
seed ^= seed >> 33;
5491-
seed *= 0xff51afd7ed558ccdULL;
5492-
seed ^= seed >> 33;
5493-
seed *= 0xc4ceb9fe1a85ec53ULL;
5494-
seed ^= seed >> 33;
5495-
5496-
std::seed_seq seq{
5497-
static_cast<u32>(seed),
5498-
static_cast<u32>(seed >> 32),
5499-
static_cast<u32>(seed ^ 0x9e3779b9u),
5500-
ct_seed
5501-
};
5535+
u64 seed = 0;
5536+
seed ^= static_cast<u64>(ct_seed);
5537+
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&current_process));
5538+
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&proc_mask)) << 1;
5539+
seed ^= static_cast<u64>(reinterpret_cast<std::uintptr_t>(&sys_mask)) << 2;
5540+
seed ^= seed >> 33;
5541+
seed *= 0xff51afd7ed558ccdULL;
5542+
seed ^= seed >> 33;
5543+
seed *= 0xc4ceb9fe1a85ec53ULL;
5544+
seed ^= seed >> 33;
5545+
std::seed_seq seq{ static_cast<u32>(seed), static_cast<u32>(seed >> 32), static_cast<u32>(seed ^ 0x9e3779b9u), ct_seed };
55025546

5503-
std::mt19937 gen(seq);
5504-
return 1ull << idxs[std::uniform_int_distribution<u32>(0, n - 1)(gen)];
5505-
}
5506-
return 1ull;
5547+
// std::random_device{}() uses RDRAND/RDSEED which can be intercepted by hypervisors
5548+
// we use our own compile-time seed that cannot be taken by examining PE/Linux binary properties and would need static/dynamic analysis
5549+
// this changes per build and per process session due to hardware ASLR
5550+
std::mt19937 gen(seq);
5551+
return 1ull << pick[std::uniform_int_distribution<u32>(0, m - 1)(gen)];
55075552
};
55085553

55095554
// we dont use cpu::cpuid on purpose
@@ -5531,7 +5576,7 @@ struct VM {
55315576
#endif
55325577
#else
55335578
i32 dummy[4];
5534-
__cpuidex(dummy, 0x0, 0);
5579+
__cpuidex(dummy, 0x0, 0); // leaf 0 because it's the most stable one for making ratio checks, even if at first glance it may be abusable because it's the fastest one
55355580
#endif
55365581
};
55375582

@@ -5554,7 +5599,7 @@ struct VM {
55545599
execute_lfence_8();
55555600
};
55565601

5557-
const DWORD_PTR target_affinity = get_target_mask();
5602+
const DWORD_PTR target_affinity = get_counter_mask();
55585603

55595604
// our software clock, it will count how many cycles a vmexit takes
55605605
auto counter_thread = [&]() {
@@ -5730,14 +5775,14 @@ struct VM {
57305775

57315776
const HANDLE current_thread = reinterpret_cast<HANDLE>(-2LL);
57325777
const HANDLE current_process = reinterpret_cast<HANDLE>(-1LL);
5733-
const DWORD_PTR old_affinity = SetThreadAffinityMask(current_thread, 1);
5778+
const DWORD_PTR old_affinity = get_trigger_mask();
57345779
const int old_thread_priority = GetThreadPriority(current_thread);
57355780
const DWORD old_process_priority = GetPriorityClass(current_process);
57365781
SetPriorityClass(current_process, ABOVE_NORMAL_PRIORITY_CLASS); // ABOVE_NORMAL_PRIORITY_CLASS + THREAD_PRIORITY_HIGHEST = 12 base priority
57375782
SetThreadPriority(current_thread, THREAD_PRIORITY_HIGHEST);
57385783
SetThreadPriorityBoost(current_thread, TRUE); // disable dynamic boosts
57395784

5740-
// so that hypervisor can't predict how many samples we will collect
5785+
// important so that hypervisor can't predict how many samples we will collect
57415786
// stack-only / ASLR-derived component (no APIs, no rdtsc)
57425787
u64 seed = 0;
57435788
seed ^= static_cast<u64>(ct_seed);
@@ -5777,7 +5822,7 @@ struct VM {
57775822
VirtualLock(vm_samples.data(), BATCH_SIZE * sizeof(u64)); // lock the memory for the samples to prevent page faults if permissions are enough
57785823
VirtualLock(ref_samples.data(), BATCH_SIZE * sizeof(u64));
57795824

5780-
state.start_test.store(true, std::memory_order_release); // _mm_pause can be exited conditionally, spam hit L3
5825+
state.start_test.store(true, std::memory_order_release); // _mm_pause can be vm-exited conditionally, spam hit L3
57815826
// warm-up to settle caches, scheduler and frequency boosts
57825827
for (int i = 0; i < 1000; ++i) {
57835828
for (int j = 0; j < 2; ++j) trigger_vmexit();
@@ -5789,6 +5834,7 @@ struct VM {
57895834
// cpuid and lfence interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally
57905835
u64 v_pre, v_post, r_pre, r_post, sync;
57915836

5837+
// this is done as a countermeasure against both legitimate and malicious hypervisor interrupts that may pause the counter thread while we measure
57925838
sync = state.counter; while (state.counter == sync); // infer if counter got enough quantum momentum (so its currently scheduled)
57935839
sync = state.counter; while (state.counter == sync); // fastest busy-waiting strategy, PAUSE affects cache, calling APIs like SwitchToThread() would be even worse
57945840

@@ -5797,7 +5843,7 @@ struct VM {
57975843
v_pre = state.counter;
57985844
std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences
57995845

5800-
trigger_vmexit(); // this forces the hypervisor to keep interception and try to bypass latency, or disable interception and try to bypass XSAVE states
5846+
trigger_vmexit(); // this forces the hypervisor to keep interception and try to bypass latency, or disable interception if on AMD and try to bypass XSAVE states
58015847

58025848
std::atomic_signal_fence(std::memory_order_seq_cst);
58035849
v_post = state.counter;
@@ -5817,6 +5863,7 @@ struct VM {
58175863
sync = state.counter; while (state.counter == sync); // sync to our counter tick again
58185864
sync = state.counter; while (state.counter == sync);
58195865

5866+
// LFENCE check is after CPUID on purpose, so that possible artificial pauses when cpuid is executed affect LFENCE too due to the latency of sending an IPI
58205867
if (!apply_multiplier) {
58215868
r_pre = state.counter;
58225869
std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering
@@ -5831,7 +5878,7 @@ struct VM {
58315878
r_pre = state.counter;
58325879
std::atomic_signal_fence(std::memory_order_seq_cst);
58335880

5834-
// scaled if counter thread is not able to increment in time due to CPUID being too fast
5881+
// scaled if counter thread is not able to increment in time due to cache ownership invalidation
58355882
execute_lfence_16();
58365883

58375884
std::atomic_signal_fence(std::memory_order_seq_cst);
@@ -5846,10 +5893,7 @@ struct VM {
58465893
}
58475894
else if (v_post <= v_pre && !apply_multiplier) {
58485895
invalid++;
5849-
if (invalid >= 250) {
5850-
debug("TIMER: Detected trigger thread monopolizing cache ownership; unstable path was activated to increase performance");
5851-
apply_multiplier = true;
5852-
}
5896+
if (invalid >= 250) apply_multiplier = true;
58535897
}
58545898
}
58555899

@@ -5860,9 +5904,17 @@ struct VM {
58605904
const double latency_ratio = ref_l ? (double)cpuid_l / (double)ref_l : 0;
58615905

58625906
// VMM = Time spent in hypervisor; nVMM = Time spent in baremetal
5863-
debug("TIMER: VMM -> ", cpuid_l, " | nVMM -> ", ref_l, " | Ratio -> ", latency_ratio);
5907+
debug("TIMER: VMM -> ", cpuid_l, " | nVMM -> ", ref_l, " | Ratio -> ", latency_ratio); // those are NOT cycles
58645908
if (latency_ratio >= threshold) hypervisor_detected = true;
58655909

5910+
// Detect IPI-based counter pausing bypasses
5911+
// For the median itself to exceed baremetal limits (which rarely pass 1000), an interrupt must be occurring on almost EVERY single loop iteration
5912+
// This is the footprint of a hypervisor continuously spamming cross-core IPIs to try and pause the counter thread (or the trigger_thread to make LFENCE take a lot of time)
5913+
if (!hypervisor_detected && (cpuid_l > 1000 || ref_l > 1000)) {
5914+
debug("TIMER: Detected artificial IPI delivery to VMAware's threads");
5915+
bypass_detected = true;
5916+
}
5917+
58665918
// Now detect bypassers disabling cpuid interception with SVM
58675919
// Even when a bypasser disables INTERCEPT_CPUID in the VMCB, they often fail to realize that certain CPUID leaves do not return static values from the hardware
58685920
// Instead, they return values based on the LAPIC state or internal CPU registers that the hypervisor must initialize for the vCPU to function

0 commit comments

Comments
 (0)