@@ -4943,7 +4943,7 @@ struct VM {
     // we used a rng before running the traditional rdtsc-cpuid-rdtsc trick
 
     // sometimes not intercepted by some hypervisors (like VirtualBox) under compat mode
-    auto cpuid_ex = [&](int leaf, int subleaf) noexcept -> u64 {
+    auto cpuid = [&](unsigned int leaf) noexcept -> u64 {
     #if (MSVC)
         // make regs volatile so the writes cannot be optimized out; without this, release builds measure around 40 cycles even under Hyper-V
         volatile int regs[4]{};
@@ -4956,7 +4956,7 @@ struct VM {
         // prevent the compiler from moving the __cpuid call before the t1 read
         COMPILER_BARRIER();
 
-        __cpuidex((int*)regs, leaf, subleaf);
+        __cpuid((int*)regs, static_cast<int>(leaf)); // not using cpu::cpuid to get a chance of inlining
 
         COMPILER_BARRIER();
 
@@ -4984,7 +4984,7 @@ struct VM {
         // because the compiler must honor the write to a volatile variable.
         asm volatile("cpuid"
                      : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
-                     : "a"(leaf), "c"(subleaf)
+                     : "a"(leaf)
                      : "memory");
 
         COMPILER_BARRIER();
@@ -4999,8 +4999,6 @@ struct VM {
     #endif
     };
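
The TSC reads around the call are elided between the hunks above; conceptually the lambda times one serialized CPUID with rdtsc on either side. A minimal sketch of that idea on the MSVC side, assuming only the documented __rdtsc/__cpuid/_ReadWriteBarrier intrinsics (timed_cpuid is an illustrative name, not VMAware's API):

    // minimal sketch: time a single serialized CPUID with the TSC (MSVC)
    #include <intrin.h>
    #include <cstdint>

    static std::uint64_t timed_cpuid(unsigned int leaf) noexcept {
        volatile int regs[4]{};   // volatile so the write cannot be dropped
        _ReadWriteBarrier();      // compiler barrier, as COMPILER_BARRIER presumably expands to
        const std::uint64_t t1 = __rdtsc();
        _ReadWriteBarrier();
        __cpuid((int*)regs, static_cast<int>(leaf));
        _ReadWriteBarrier();
        const std::uint64_t t2 = __rdtsc();
        _ReadWriteBarrier();
        return t2 - t1;           // elapsed cycles around the CPUID
    }
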
 
-    constexpr u16 iterations = 1000;
-
     auto calculate_latency = [&](const std::vector<u64>& samples_in) -> u64 {
         if (samples_in.empty()) return 0;
         const size_t N = samples_in.size();
@@ -5079,10 +5077,28 @@ struct VM {
         return result;
     };
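
The body of calculate_latency is elided by the hunk; any robust central estimate works here. A sketch of one reasonable choice, a median via std::nth_element, which shrugs off the rare interrupt-inflated sample (median_latency is an illustrative name; the project's actual estimator may differ):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // sketch: median of the samples; a few context-switch outliers cannot skew it
    static std::uint64_t median_latency(std::vector<std::uint64_t> samples) {
        if (samples.empty()) return 0;
        const std::size_t mid = samples.size() / 2;
        std::nth_element(samples.begin(), samples.begin() + mid, samples.end());
        return samples[mid];
    }
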
50815079
+    // intel leaves on AMD and vice versa will still work for this probe
+    constexpr unsigned int leaves[] = {
+        0xB,         // topology
+        0xD,         // xsave/xstate
+        0x4,         // deterministic cache params
+        0x1,         // basic features
+        0x7,         // extended features
+        0xA,         // architectural performance monitoring
+        0x12,        // SGX/enclave
+        0x5,         // MONITOR/MWAIT
+        0x40000000u, // hypervisor range start
+        0x80000008u, // extended address limits (amd/intel ext)
+        0x0          // fallback to leaf 0 occasionally
+    };
+    constexpr size_t n_leaves = sizeof(leaves) / sizeof(leaves[0]);
+
+    constexpr u16 iterations = 1000;
+
     // pre-allocate the sample buffer and touch its pages to avoid page faults during measurement
     std::vector<u64> samples;
-    samples.resize(iterations);
-    for (unsigned i = 0; i < iterations; ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
+    samples.resize(n_leaves * iterations);
+    for (size_t i = 0; i < samples.size(); ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
 
     /*
      * We want to move our thread from the Running state to the Waiting state
@@ -5097,126 +5113,40 @@ struct VM {
      * This gives us more time for sampling before we're rescheduled again
      */
 
-#if (WINDOWS)
-// voluntary context switch to get a fresh quantum
-SleepEx(1, FALSE);
-#else
-// should work similarly in Unix-like operating systems
-std::this_thread::sleep_for(std::chrono::milliseconds(1));
-#endif
+    #if (WINDOWS)
+    // voluntary context switch to get a fresh quantum
+    SleepEx(1, FALSE);
+    #else
+    // should work similarly in Unix-like operating systems
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    #endif
+
+    // warm up by rotating through the leaves to exercise different cpuid paths
     for (int w = 0; w < 128; ++w) {
-        volatile u64 tmp = cpuid_ex(0, 0);
+        volatile u64 tmp = cpuid(leaves[w % n_leaves]);
         VMAWARE_UNUSED(tmp);
     }
 
-    for (unsigned i = 0; i < iterations; ++i) {
-        samples[i] = cpuid_ex(0, 0); // leaf 0 just returns static data so it should be fast
-    }
-
-    const u64 cpuid_latency_leaf0 = calculate_latency(samples);
-
-    // Extended Topology requires the hypervisor to calculate dynamic x2APIC IDs
-    // we expect this to crash entire VMs if the kernel developer is not careful enough
-    for (unsigned i = 0; i < iterations; ++i) {
-        samples[i] = cpuid_ex(0xB, 0);
+    // 1000 iterations per leaf, stored contiguously per leaf
+    for (size_t li = 0; li < n_leaves; ++li) {
+        const unsigned int leaf = leaves[li];
+        for (unsigned i = 0; i < iterations; ++i) {
+            samples[li * iterations + i] = cpuid(leaf);
+        }
     }
-    const u64 cpuid_latency_leafB = calculate_latency(samples);
 
-    debug("TIMER: Leaf 0 latency -> ", cpuid_latency_leaf0);
-    debug("TIMER: Leaf 0xB latency -> ", cpuid_latency_leafB);
+    const u64 cpuid_latency = calculate_latency(samples);
 
-    // simple differential analysis
-    if (cpuid_latency_leaf0 > 0) {
-        if (cpuid_latency_leafB > (cpuid_latency_leaf0 * 1.6)) {
-            debug("TIMER: VMAware detected a CPUID patch");
-            return true;
-        }
-    }
+    debug("TIMER: VMEXIT latency -> ", cpuid_latency);
 
-    if (cpuid_latency_leaf0 >= cycle_threshold) {
-        return true;
-    }
-    if (cpuid_latency_leafB >= cycle_threshold) {
+    if (cpuid_latency >= cycle_threshold) {
         return true;
     }
-    else if (cpuid_latency_leaf0 <= 20) { // cpuid is fully serializing; not even old CPUs average this few cycles in real-world scenarios
+    else if (cpuid_latency <= 20) { // cpuid is fully serializing; not even old CPUs average this few cycles in real-world scenarios
         return true;
     }
-
-    // the core idea is to force the host scheduler's pending signal check (kvm_vcpu_check_block)
-    // we detect cpuid patches that just do fast vmexits by spawning a thread on the SAME core that spams the patched instruction
-    // if patched, the host core enters an uninterruptible loop, starving the timer interrupt needed for the sleep syscall
-#if (WINDOWS)
-    {
-        using NtCreateThreadEx_t = NTSTATUS(__stdcall*)(PHANDLE, ACCESS_MASK, PVOID, HANDLE, PVOID, PVOID, ULONG, ULONG_PTR, ULONG_PTR, ULONG_PTR, PVOID);
-        using NtTerminateThread_t = NTSTATUS(__stdcall*)(HANDLE, NTSTATUS);
-        using NtWaitForSingleObject_t = NTSTATUS(__stdcall*)(HANDLE, BOOLEAN, PLARGE_INTEGER);
-
-        const HMODULE ntdll = util::get_ntdll();
-        if (ntdll) {
-            const char* names[] = { "NtCreateThreadEx", "NtTerminateThread", "NtWaitForSingleObject" };
-            void* funcs[3] = {};
-            util::get_function_address(ntdll, names, funcs, 3);
-
-            auto pNtCreateThreadEx = (NtCreateThreadEx_t)funcs[0];
-            auto pNtTerminateThread = (NtTerminateThread_t)funcs[1];
-            auto pNtWaitForSingleObject = (NtWaitForSingleObject_t)funcs[2];
-
-            if (pNtCreateThreadEx && pNtTerminateThread && pNtWaitForSingleObject) {
-
-                // stateless lambda castable to a thread routine
-                auto spammer_routine = [](PVOID) -> DWORD {
-                    // this loop exploits the patch's lack of interrupt window checking
-                    while (true) {
-                        int regs[4];
-                        __cpuid(regs, 0);
-                    }
-                    return 0;
-                };
-
-                HANDLE hSpammer = nullptr;
-                const NTSTATUS status = pNtCreateThreadEx(&hSpammer, MAXIMUM_ALLOWED, nullptr, GetCurrentProcess(),
-                    (PVOID)(uintptr_t(+spammer_routine)), nullptr, TRUE, 0, 0, 0, nullptr);
-
-                if (status >= 0 && hSpammer) {
-                    // forcing contention
-                    THREAD_BASIC_INFORMATION tbi_local{};
-                    if (pNtQueryInformationThread(hCurrentThread, ThreadBasicInformation, &tbi_local, sizeof(tbi_local), nullptr) >= 0) {
-                        pNtSetInformationThread(hSpammer, ThreadAffinityMask, &tbi_local.AffinityMask, sizeof(ULONG_PTR));
-                    }
-
-                    ResumeThread(hSpammer);
-
-                    LARGE_INTEGER qpc_start, qpc_end, qpc_freq;
-                    QueryPerformanceFrequency(&qpc_freq);
-                    QueryPerformanceCounter(&qpc_start);
-
-                    // expecting gibberish cpuid patches to lock up the interrupt timer
-                    // via the infinite fastpath loop on the physical core, causing a massive overshoot
-                    SleepEx(10, FALSE);
-
-                    QueryPerformanceCounter(&qpc_end);
-
-                    // cleanup
-                    pNtTerminateThread(hSpammer, 0);
-                    pNtWaitForSingleObject(hSpammer, FALSE, nullptr);
-                    CloseHandle(hSpammer);
-
-                    double elapsed_ms = (double)(qpc_end.QuadPart - qpc_start.QuadPart) * 1000.0 / (double)qpc_freq.QuadPart;
-
-                    debug("TIMER: Timer interrupt starvation -> ", elapsed_ms, " ms");
-
-                    if (elapsed_ms > 40.0) {
-                        debug("TIMER: VMAware detected a CPUID patch");
-                        return true;
-                    }
-                }
-            }
-        }
-    }
-#endif
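
The starvation probe removed above is tied to NT internals, but the idea itself is portable: pin a spinner that hammers CPUID onto the same logical core, sleep briefly, and check for a gross overshoot. A minimal cross-platform sketch of that idea, assuming std::thread and std::chrono and omitting the core pinning the real probe depends on (sleep_overshoots is an illustrative name):

    #include <atomic>
    #include <chrono>
    #include <thread>
    #if defined(_MSC_VER)
    #include <intrin.h>
    #endif

    // sketch: spam CPUID on a sibling thread while the main thread sleeps;
    // a patched fastpath that never opens an interrupt window makes the
    // sleep overshoot massively (affinity pinning omitted here)
    static bool sleep_overshoots(double limit_ms = 40.0) {
        std::atomic<bool> stop{ false };
        std::thread spammer([&stop] {
            while (!stop.load(std::memory_order_relaxed)) {
    #if defined(_MSC_VER)
                int regs[4];
                __cpuid(regs, 0);
    #else
                unsigned int a, b, c, d;
                asm volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(0));
    #endif
            }
        });
        const auto t0 = std::chrono::steady_clock::now();
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        const auto t1 = std::chrono::steady_clock::now();
        stop.store(true, std::memory_order_relaxed);
        spammer.join();
        const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        return ms > limit_ms; // massive overshoot suggests a starved timer interrupt
    }
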
     // TLB flushes or side-channel cache attacks are not even attempted due to how unreliable they are against stealthy hypervisors
-#endif
+    #endif
     return false;
 }
52225152