
Commit a6d15bb

Author: Requiem
Commit message: feat: improved cpuid trap check
Parent: 0d7252c

1 file changed: src/vmaware.hpp (44 additions, 114 deletions)
@@ -4943,7 +4943,7 @@ struct VM {
             // we used a rng before running the traditional rdtsc-cpuid-rdtsc trick
 
             // sometimes not intercepted in some hvs (like VirtualBox) under compat mode
-            auto cpuid_ex = [&](int leaf, int subleaf) noexcept -> u64 {
+            auto cpuid = [&](unsigned int leaf) noexcept -> u64 {
 #if (MSVC)
                 // make regs volatile so writes cannot be optimized out, if this isn't added and the code is compiled in release mode, cycles would be around 40 even under Hyper-V
                 volatile int regs[4]{};
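Reviewer note: the hunk above only shows the probe's setup; the rdtsc reads and the COMPILER_BARRIER() calls that bracket the timed CPUID sit outside the diff context. A minimal, self-contained sketch of the same measurement pattern (assuming MSVC intrinsics; an illustration, not the library's exact code):

    #include <intrin.h>   // __cpuid, __rdtsc, _ReadWriteBarrier
    #include <cstdint>

    // sketch: time one CPUID in TSC ticks (MSVC path)
    static std::uint64_t time_cpuid_once(unsigned int leaf) noexcept {
        volatile int regs[4]{};               // volatile: the result write cannot be optimized away
        _ReadWriteBarrier();                  // compiler-only barrier, stands in for COMPILER_BARRIER()
        const std::uint64_t t1 = __rdtsc();
        _ReadWriteBarrier();
        __cpuid((int*)regs, static_cast<int>(leaf));
        _ReadWriteBarrier();
        const std::uint64_t t2 = __rdtsc();
        _ReadWriteBarrier();
        return t2 - t1;                       // includes the VM-exit round trip when CPUID is trapped
    }

On bare metal a serialized CPUID costs roughly on the order of a hundred cycles; a trapping hypervisor adds its exit/entry cost on top, which is what the cycle_threshold comparison later in the function keys on.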
@@ -4956,7 +4956,7 @@ struct VM {
                 // prevent the compiler from moving the __cpuid call before the t1 read
                 COMPILER_BARRIER();
 
-                __cpuidex((int*)regs, leaf, subleaf);
+                __cpuid((int*)regs, static_cast<int>(leaf)); // not using cpu::cpuid to get a chance of inlining
 
                 COMPILER_BARRIER();
 
@@ -4984,7 +4984,7 @@ struct VM {
                 // because the compiler must honor the write to a volatile variable.
                 asm volatile("cpuid"
                     : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
-                    : "a"(leaf), "c"(subleaf)
+                    : "a"(leaf)
                     : "memory");
 
                 COMPILER_BARRIER();
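Reviewer note: both hunks above drop the explicit subleaf. On MSVC, __cpuid is documented to clear ECX before executing the instruction, so it behaves like __cpuidex with subleaf 0; on the inline-asm side ECX is simply left out of the inputs, which is acceptable here because only the exit latency matters, not which subleaf gets reported. A self-contained sketch of the non-MSVC measurement path (assuming x86-64 GCC/Clang; the empty asm statements approximate COMPILER_BARRIER(), and the subleaf is pinned to 0 for determinism even though the patch omits it):

    #include <cstdint>
    #include <x86intrin.h>   // __rdtsc

    // sketch: time one CPUID via inline asm (GCC/Clang path)
    static std::uint64_t time_cpuid_once_gcc(unsigned int leaf) noexcept {
        unsigned int a = 0, b = 0, c = 0, d = 0;
        asm volatile("" ::: "memory");              // compiler barrier
        const std::uint64_t t1 = __rdtsc();
        asm volatile("cpuid"
                     : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                     : "a"(leaf), "c"(0u)
                     : "memory");                   // memory clobber doubles as a barrier
        const std::uint64_t t2 = __rdtsc();
        asm volatile("" ::: "memory");
        volatile unsigned int sink = a ^ b ^ c ^ d; // keep the outputs observable
        (void)sink;
        return t2 - t1;
    }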
@@ -4999,8 +4999,6 @@ struct VM {
 #endif
             };
 
-            constexpr u16 iterations = 1000;
-
             auto calculate_latency = [&](const std::vector<u64>& samples_in) -> u64 {
                 if (samples_in.empty()) return 0;
                 const size_t N = samples_in.size();
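Reviewer note: the body of calculate_latency lies outside the diff context, so the reduction it applies is not visible here. Whatever it does, the usual concern with this kind of sampling is that a few measurements get polluted by interrupts or a reschedule, so a robust statistic beats a plain mean. A hedged sketch of one such reduction (an assumption for illustration, not necessarily what vmaware.hpp implements):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // sketch: median of the collected cycle counts, robust against outlier samples
    static std::uint64_t median_latency(std::vector<std::uint64_t> samples) {
        if (samples.empty()) return 0;
        const std::size_t mid = samples.size() / 2;
        std::nth_element(samples.begin(), samples.begin() + mid, samples.end());
        return samples[mid];
    }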
@@ -5079,10 +5077,28 @@ struct VM {
                 return result;
             };
 
+            // intel leaves on AMD and vice versa will still work for this probe
+            constexpr unsigned int leaves[] = {
+                0xB,         // topology
+                0xD,         // xsave/xstate
+                0x4,         // deterministic cache params
+                0x1,         // basic features
+                0x7,         // extended features
+                0xA,         // architectural performance monitoring
+                0x12,        // SGX/enclave
+                0x5,         // MONITOR/MWAIT
+                0x40000000u, // hypervisor range start
+                0x80000008u, // extended address limits (amd/intel ext)
+                0x0          // fallback to leaf 0 occasionally
+            };
+            constexpr size_t n_leaves = sizeof(leaves) / sizeof(leaves[0]);
+
+            constexpr u16 iterations = 1000;
+
             // pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
             std::vector<u64> samples;
-            samples.resize(iterations);
-            for (unsigned i = 0; i < iterations; ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
+            samples.resize(n_leaves * iterations);
+            for (size_t i = 0; i < samples.size(); ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
 
             /*
              * We want to move our thread from the Running state to the Waiting state
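Reviewer note on the resized buffer: 11 leaves x 1000 iterations x 8 bytes is 88,000 bytes, roughly 22 pages of 4 KiB, so zeroing every element up front faults the whole allocation in before the timed loops start. An equivalent, more explicit pre-fault would touch one byte per page (sketch only; page_size is a name introduced here, and the patch's element-wise zeroing already has the same effect):

    // touch one byte per 4 KiB page so no page fault lands inside the measurement window
    constexpr std::size_t page_size = 4096;
    volatile unsigned char* base = reinterpret_cast<volatile unsigned char*>(samples.data());
    for (std::size_t off = 0; off < samples.size() * sizeof(u64); off += page_size) {
        base[off] = 0;
    }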
@@ -5097,126 +5113,40 @@ struct VM {
              * This gives us more time for sampling before we're rescheduled again
              */
 
-#if (WINDOWS)
-            // voluntary context switch to get a fresh quantum
-            SleepEx(1, FALSE);
-#else
-            // should work similarly in Unix-like operating systems
-            std::this_thread::sleep_for(std::chrono::milliseconds(1));
-#endif
+#if (WINDOWS)
+            // voluntary context switch to get a fresh quantum
+            SleepEx(1, FALSE);
+#else
+            // should work similarly in Unix-like operating systems
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+#endif
+
+            // warm up by rotating through leaves to exercise different cpuid paths
             for (int w = 0; w < 128; ++w) {
-                volatile u64 tmp = cpuid_ex(0, 0);
+                volatile u64 tmp = cpuid(leaves[w % n_leaves]);
                 VMAWARE_UNUSED(tmp);
             }
 
-            for (unsigned i = 0; i < iterations; ++i) {
-                samples[i] = cpuid_ex(0, 0); // leaf 0 just returns static data so it should be fast
-            }
-
-            const u64 cpuid_latency_leaf0 = calculate_latency(samples);
-
-            // Extended Topology requires the hypervisor to calculate dynamic x2APIC IDs
-            // we expect this to crash entire VMs if the kernel developer is not enough
-            for (unsigned i = 0; i < iterations; ++i) {
-                samples[i] = cpuid_ex(0xB, 0);
+            // 1000 iterations per leaf, store contiguously per-leaf
+            for (size_t li = 0; li < n_leaves; ++li) {
+                const unsigned int leaf = leaves[li];
+                for (unsigned i = 0; i < iterations; ++i) {
+                    samples[li * iterations + i] = cpuid(leaf);
+                }
             }
-            const u64 cpuid_latency_leafB = calculate_latency(samples);
 
-            debug("TIMER: Leaf 0 latency -> ", cpuid_latency_leaf0);
-            debug("TIMER: Leaf 0xB latency -> ", cpuid_latency_leafB);
+            const u64 cpuid_latency = calculate_latency(samples);
 
-            // simple differential analysis
-            if (cpuid_latency_leaf0 > 0) {
-                if (cpuid_latency_leafB > (cpuid_latency_leaf0 * 1.6)) {
-                    debug("TIMER: VMAware detected a CPUID patch");
-                    return true;
-                }
-            }
+            debug("TIMER: VMEXIT latency -> ", cpuid_latency);
 
-            if (cpuid_latency_leaf0 >= cycle_threshold) {
-                return true;
-            }
-            if (cpuid_latency_leafB >= cycle_threshold) {
+            if (cpuid_latency >= cycle_threshold) {
                 return true;
             }
-            else if (cpuid_latency_leaf0 <= 20) { // cpuid is fully serializing, not even old CPUs have this low average cycles in real-world scenarios
+            else if (cpuid_latency <= 20) { // cpuid is fully serializing, not even old CPUs have this low average cycles in real-world scenarios
                 return true;
             }
-
-            // the core idea is to force the host scheduler's pending signal check (kvm_vcpu_check_block)
-            // We detect cpuid patches that just do fast vmexits by spawning a thread on the SAME core that spams the patched instruction
-            // If patched, the host core enters an uninterruptible loop, starving the timer interrupt needed for the sleep syscall
-#if (WINDOWS)
-            {
-                using NtCreateThreadEx_t = NTSTATUS(__stdcall*)(PHANDLE, ACCESS_MASK, PVOID, HANDLE, PVOID, PVOID, ULONG, ULONG_PTR, ULONG_PTR, ULONG_PTR, PVOID);
-                using NtTerminateThread_t = NTSTATUS(__stdcall*)(HANDLE, NTSTATUS);
-                using NtWaitForSingleObject_t = NTSTATUS(__stdcall*)(HANDLE, BOOLEAN, PLARGE_INTEGER);
-
-                const HMODULE ntdll = util::get_ntdll();
-                if (ntdll) {
-                    const char* names[] = { "NtCreateThreadEx", "NtTerminateThread", "NtWaitForSingleObject" };
-                    void* funcs[3] = {};
-                    util::get_function_address(ntdll, names, funcs, 3);
-
-                    auto pNtCreateThreadEx = (NtCreateThreadEx_t)funcs[0];
-                    auto pNtTerminateThread = (NtTerminateThread_t)funcs[1];
-                    auto pNtWaitForSingleObject = (NtWaitForSingleObject_t)funcs[2];
-
-                    if (pNtCreateThreadEx && pNtTerminateThread && pNtWaitForSingleObject) {
-
-                        // stateless lambda castable to thread routine
-                        auto spammer_routine = [](PVOID) -> DWORD {
-                            // This loop exploits the patch's lack of interrupt window checking
-                            while (true) {
-                                int regs[4];
-                                __cpuid(regs, 0);
-                            }
-                            return 0;
-                        };
-
-                        HANDLE hSpammer = nullptr;
-                        const NTSTATUS status = pNtCreateThreadEx(&hSpammer, MAXIMUM_ALLOWED, nullptr, GetCurrentProcess(),
-                            (PVOID)(uintptr_t(+spammer_routine)), nullptr, TRUE, 0, 0, 0, nullptr);
-
-                        if (status >= 0 && hSpammer) {
-                            // forcing contention
-                            THREAD_BASIC_INFORMATION tbi_local{};
-                            if (pNtQueryInformationThread(hCurrentThread, ThreadBasicInformation, &tbi_local, sizeof(tbi_local), nullptr) >= 0) {
-                                pNtSetInformationThread(hSpammer, ThreadAffinityMask, &tbi_local.AffinityMask, sizeof(ULONG_PTR));
-                            }
-
-                            ResumeThread(hSpammer);
-
-                            LARGE_INTEGER qpc_start, qpc_end, qpc_freq;
-                            QueryPerformanceFrequency(&qpc_freq);
-                            QueryPerformanceCounter(&qpc_start);
-
-                            // expecting gibberish cpuid patches to lock the interrupt timer
-                            // by the infinite fastpath loop on the physical core, causing a massive overshoot
-                            SleepEx(10, FALSE);
-
-                            QueryPerformanceCounter(&qpc_end);
-
-                            // Cleanup
-                            pNtTerminateThread(hSpammer, 0);
-                            pNtWaitForSingleObject(hSpammer, FALSE, nullptr);
-                            CloseHandle(hSpammer);
-
-                            double elapsed_ms = (double)(qpc_end.QuadPart - qpc_start.QuadPart) * 1000.0 / (double)qpc_freq.QuadPart;
-
-                            debug("TIMER: Timer interrupt starvation -> ", elapsed_ms, " ms");
-
-                            if (elapsed_ms > 40.0) {
-                                debug("TIMER: VMAware detected a CPUID patch");
-                                return true;
-                            }
-                        }
-                    }
-                }
-            }
-#endif
             // TLB flushes or side channel cache attacks are not even tried due to how unreliable they are against stealthy hypervisors
-#endif
+#endif
             return false;
         }
 
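Reviewer note: with the per-leaf debug lines and the leaf0-vs-leafB differential analysis removed, the verdict now rests on a single combined latency compared against cycle_threshold, plus the <= 20 lower bound (a fully serializing CPUID cannot average that low on real hardware, so such a reading hints that the TSC values themselves are being manipulated). Because the samples are stored contiguously per leaf (index li * iterations + i), a per-leaf figure can still be recovered from the same buffer if finer-grained debugging is ever wanted; a sketch reusing the names from the patch:

    // sketch: slice the flat buffer back into per-leaf ranges and reduce each one
    for (size_t li = 0; li < n_leaves; ++li) {
        const std::vector<u64> slice(samples.begin() + li * iterations,
                                     samples.begin() + (li + 1) * iterations);
        debug("TIMER: leaf ", leaves[li], " latency -> ", calculate_latency(slice));
    }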
