src/vmaware.hpp: 82 additions & 39 deletions
@@ -4602,7 +4602,7 @@ struct VM {
     // will be used in cpuid measurements later
     u16 cycle_threshold = 800;
     if (util::hyper_x() == HYPERV_ARTIFACT_VM) {
-        cycle_threshold = 3500; // if we're running under Hyper-V, make VMAware detect nested virtualization
+        cycle_threshold = 3250; // if we're running under Hyper-V, make VMAware detect nested virtualization
     }

 #if (WINDOWS)
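
The threshold above is compared against a timed CPUID round trip measured later in this function. A minimal sketch of how such a measurement could look; the helper name and iteration count are illustrative, not VMAware's actual code:

    #include <intrin.h>
    #include <cstdint>

    // Hypothetical sketch: average the cycle cost of CPUID leaf 0, which a
    // hypervisor must intercept (an unconditional vmexit). Bare metal usually
    // lands far below a threshold like 800 cycles; the exit/entry round trip
    // under a hypervisor pushes the average past it.
    static uint64_t measure_cpuid_cycles(unsigned iterations = 100) {
        int regs[4] = { 0 };
        uint64_t total = 0;
        for (unsigned i = 0; i < iterations; ++i) {
            const uint64_t start = __rdtsc();
            __cpuid(regs, 0);                // forced exit under virtualization
            total += __rdtsc() - start;      // accumulate the round-trip cost
        }
        return total / iterations;           // average out scheduler noise
    }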
@@ -4767,12 +4767,12 @@ struct VM {
         }
     }

-    // RDTSC trap detection
-    // This detection uses two clocks and two loops, a loop that the hypervisor can spoof and a loop that the hypervisor cannot
-    // When RDTSC is hooked, the hypervisor usually "downscales" the result to hide the time passed or doesnt let TSC advance for the time it was vm-exiting
+    /* TSC offsetting detection */
+    // This detection uses two clocks and two loops: a loop and a timer that the hypervisor can spoof, and a second loop/timer that the hypervisor cannot
+    // When the TSC is "hooked", the hypervisor usually downscales the result to hide the time passed, or doesn't let the TSC advance for the time it was vm-exiting
     // However, the hypervisor has absolutely no way to downscale time for the second loop because it runs natively on the CPU without exiting
-    // This creates a discrepancy in the ratio of both loops
-    // The hypervisor cannot easily rewind the system wall clock (second loop, QIT/KUSER_SHARED_DATA) without causing system instability (network timeouts, audio lag)
+    // This creates a massive discrepancy in the ratio of both loops, contrary to the very small ratio difference if both timers were to run normally
+    // The hypervisor cannot easily rewind the system wall clock (second loop, QIT/KUSER_SHARED_DATA) without causing system instability (network timeouts, audio lag, etc.)

     static thread_local volatile u64 g_sink = 0; // thread_local volatile so that it doesn't need to be captured by the lambda

     // First we start by randomizing counts WITHOUT syscalls and WITHOUT using instructions that can be trapped by hypervisors, this was a hard task
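
To make the ratio argument concrete, here is a hedged arithmetic sketch; all figures are illustrative (a 4GHz TSC, 100ns QIT ticks, and a hypervisor downscaling 20ms of trapped execution to a 2ms TSC advance):

    #include <cstdint>

    int main() {
        const uint64_t ticks = 200000;               // 20ms of QIT time, unspoofable
        const uint64_t tsc_trapped = 8000000;        // 2ms @ 4GHz: TSC downscaled by the hv
        const uint64_t tsc_native = 80000000;        // 20ms @ 4GHz: XOR loop, honest TSC
        const uint64_t firstRatio = tsc_trapped / ticks;   // = 40 cycles per tick
        const uint64_t secondRatio = tsc_native / ticks;   // = 400 cycles per tick
        // bare metal would yield two nearly identical ratios (~400 vs ~400);
        // the 360-point gap here is the discrepancy the check is built around
        return secondRatio - firstRatio > 100;       // mirrors the threshold test
    }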
@@ -4833,7 +4833,7 @@ struct VM {
     };

     // Use rejection sampling as before to avoid modulo bias
@@ ... @@
+    // the reason why we use CPUID rather than RDTSC is that RDTSC is a conditionally exiting instruction, and you can modify the guest TSC without trapping it
+    auto vm_exit = []() noexcept -> u64 {
+        volatile int regs[4] = { 0 }; // doesn't need to be as elaborate as the next cpuid_lambda we will use to calculate the real latency
+        __cpuid((int*)regs, 0); // unconditional vmexit
+        return (u64)regs[0]; // data dependency to survive /O2 builds, so that the CPU cannot start the next loop iteration until the current __cpuid writes to regs
     };

     auto xor_lambda = []() noexcept -> u64 {
-        volatile u64 a = 0xDEADBEEFDEADBEEFull; // can be replaced by NOPs, the core idea is to use a non-trappable instruction that the hv cannot virtualize
+        volatile u64 a = 0xDEADBEEFDEADBEEFull; // can be replaced with NOPs, etc., the core idea is to use a non-trappable instruction that the hv cannot virtualize
         volatile u64 b = 0x1234567890ABCDEFull;
         u64 v = a ^ b;
         g_sink ^= v;
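
The rejection sampling mentioned above is the standard trick for unbiased bounded randoms; a generic sketch, with a plain xorshift64 standing in for the detection's real trap-free entropy source:

    #include <cstdint>

    // stand-in entropy source for the sketch only; the real code derives
    // randomness without instructions a hypervisor could trap
    static uint64_t next_u64() {
        static uint64_t s = 0x9E3779B97F4A7C15ull;
        s ^= s << 13; s ^= s >> 7; s ^= s << 17;
        return s;
    }

    // re-draw whenever the raw value falls in the tail that would make
    // `raw % bound` favor small results (the modulo bias)
    static uint64_t bounded_random(uint64_t bound) {
        const uint64_t limit = UINT64_MAX - (UINT64_MAX % bound); // largest multiple of bound
        uint64_t raw;
        do { raw = next_u64(); } while (raw >= limit);
        return raw % bound;
    }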
@@ -4868,17 +4877,38 @@ struct VM {
     using fn_t = u64(*)();

     // make the pointer volatile so the compiler treats the call as opaque/indirect
-    volatile fn_t rd_ptr = +rd_lambda; // +lambda forces conversion to function ptr, so it won't be inlined, we need to prevent the compiler from inlining this
+    volatile fn_t cp_ptr = +vm_exit; // +lambda forces conversion to function ptr, so it won't be inlined, we need to prevent the compiler from inlining this
     volatile fn_t xor_ptr = +xor_lambda;
+    volatile u64 dummy = 0;
+
+    // run the XOR loop briefly to force the CPU out of sleep states/lower frequencies
+    // This reduces the variance (jitter) between the two measurement loops
+    // and confuses hypervisors targeting this check that might try to advance the TSC while the XOR loop is running
+    for (ULONG64 x = 0; x < 10000000; ++x) {
+        dummy += xor_ptr();
+    }

     // first measurement
     ULONG64 beforeqit = 0;
-    QueryInterruptTime(&beforeqit); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA, the reason why we use it
-    const ULONG64 beforetsc = __rdtsc();
+    // Wait for a QIT tick edge to avoid granularity errors
+    // if our loop takes 20ms, we might capture one tick (15.6ms reported) or two ticks (31.2ms reported) depending on exactly when we started relative to the system timer interrupt
+    // this causes the denominator in our ratio to jump by 50-100%, causing a delta artifact of exactly 32 (still too small for the ratio diff to trigger, but anyways)
+    // syncing ensures we always start the measurement at the exact edge of a QIT update, eliminating this jitter
+    {
+        ULONG64 start_wait, now_wait;
+        QueryInterruptTime(&start_wait);
+        do {
+            _mm_pause(); // hint to the CPU that we're spin-waiting
+            QueryInterruptTime(&now_wait); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA, the reason why we use it
+        } while (now_wait == start_wait);
+        beforeqit = now_wait;
+    }
+    // using only one rdtsc call here is harmless
+    // and it is intentional instead of manually computing with the frequency we got before (to avoid spoofing)
+    const ULONG64 beforetsc = __rdtsc();

-    volatile u64 dummy = 0;
     for (ULONG64 x = 0; x < count_first; ++x) {
-        dummy = rd_ptr(); // this loop will be intercepted by a RDTSC trap, downscaling our TSC
+        dummy += cp_ptr(); // this loop will be intercepted by a RDTSC trap, downscaling our TSC
     }

     // the kernel routine that backs this API runs at CLOCK_LEVEL (13), only preempted by IPIs, POWER_LEVEL and NMIs
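
QueryInterruptTime is backed by the InterruptTime field of KUSER_SHARED_DATA, a page the kernel maps read-only into every process at a fixed address. A rough sketch of the equivalent user-mode read; the 0x7FFE0008 offset follows public KUSER_SHARED_DATA layouts and should be double-checked against SDK headers:

    #include <cstdint>

    // KSYSTEM_TIME layout per public KUSER_SHARED_DATA references
    struct KSYSTEM_TIME {
        uint32_t LowPart;
        int32_t  High1Time;
        int32_t  High2Time;
    };

    // InterruptTime lives at offset 0x8 of KUSER_SHARED_DATA (0x7FFE0000)
    static const volatile KSYSTEM_TIME* const g_it =
        reinterpret_cast<const volatile KSYSTEM_TIME*>(0x7FFE0008);

    // same 100ns units QueryInterruptTime returns, no syscall and no RDTSC;
    // re-read until the two high words agree so a torn update is never seen
    static uint64_t read_interrupt_time() {
        int32_t high;
        uint32_t low;
        do {
            high = g_it->High1Time;
            low  = g_it->LowPart;
        } while (high != g_it->High2Time);
        return (static_cast<uint64_t>(static_cast<uint32_t>(high)) << 32) | low;
    }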
@@ -4893,11 +4923,20 @@ struct VM {

     // second measurement
     ULONG64 beforeqit2 = 0;
-    QueryInterruptTime(&beforeqit2);
+    // wait for a QIT tick edge for the second measurement as well
+    {
+        ULONG64 start_wait, now_wait;
+        QueryInterruptTime(&start_wait);
+        do {
+            _mm_pause();
+            QueryInterruptTime(&now_wait);
+        } while (now_wait == start_wait);
+        beforeqit2 = now_wait;
+    }
     const ULONG64 beforetsc2 = __rdtsc();

     for (ULONG64 x = 0; x < count_second; ++x) {
-        dummy = xor_ptr(); // this loop won't be intercepted, it never switches to kernel-mode
+        dummy += xor_ptr(); // this loop won't be intercepted, it never switches to kernel-mode
     }
     VMAWARE_UNUSED(dummy);
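
After both measurements, the comparison presumably reduces to two cycles-per-tick ratios; a sketch of that arithmetic with assumed variable names (the `after*` counterparts are not shown in this diff):

    #include <cstdint>
    typedef uint64_t ULONG64; // Windows type, aliased so the sketch is self-contained

    // each ratio is "TSC cycles elapsed per 100ns QIT tick"; a trap only
    // shrinks the first numerator, so only the first ratio collapses
    ULONG64 ratio_difference(ULONG64 beforetsc,  ULONG64 aftertsc,
                             ULONG64 beforeqit,  ULONG64 afterqit,
                             ULONG64 beforetsc2, ULONG64 aftertsc2,
                             ULONG64 beforeqit2, ULONG64 afterqit2) {
        const ULONG64 firstRatio  = (aftertsc  - beforetsc)  / (afterqit  - beforeqit);
        const ULONG64 secondRatio = (aftertsc2 - beforetsc2) / (afterqit2 - beforeqit2);
        // honest hardware: both approximate the TSC frequency in cycles per
        // tick and nearly cancel; under TSC downscaling they diverge widely
        return (firstRatio > secondRatio) ? firstRatio - secondRatio
                                          : secondRatio - firstRatio;
    }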
@@ -4911,14 +4950,14 @@ struct VM {

     /* branchless absolute difference is like:
        mask = -(uint64_t)(firstRatio < secondRatio) -> 0 or 0xFFFFFFFFFFFFFFFF
@@ ... @@
     // contrary to what someone could think, under heavy load the ratio will be closer to 0, and it will also be closer to 0 if we assign CPUs to a VM on our host machine
-    // it will increase if the BIOS is configured to run the TSC by "core usage", which is why we use a 100 threshold check based on a lot of empirical data
-    if (difference > 100) {
-        debug("TIMER: An hypervisor has been detected intercepting RDTSC");
-        return true; // both ratios will always differ if a RDTSC trap is present, since the hypervisor can't account for the XOR/NOP loop
+    // it will increase if the BIOS/UEFI is configured to run the TSC by "core usage", which is why we use this threshold check based on a lot of empirical data
+    // it increases because the CPUID instruction forces the CPU pipeline to drain and serialize (heavy workload), while the XOR loop is a tight arithmetic loop (throughput workload)
+    // CPUs will boost to different frequencies for these two scenarios (for example 4.2GHz for XOR vs 4.0GHz for CPUID)
+    // A difference of 5-10% in ratio (15-30 points) or even more is normal behavior on bare metal
+    // lastly, we might always see a small ratio difference depending on which part of the tick the measurement starts in, which is the most important reason why we need a threshold
+    if (difference >= 100) {
+        debug("TIMER: A hypervisor has been detected intercepting TSC");
+        return true; // both ratios will always differ if the TSC is downscaled, since the hypervisor can't account for the XOR/NOP loop
     }
 #endif

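The branchless absolute difference sketched in the comment above expands to something like this; a generic rendering of the mask trick, not necessarily the exact expression used:

    #include <cstdint>

    // mask is 0 when a >= b and all-ones when a < b; XOR-then-subtract flips
    // the sign of (a - b) exactly when it would have been negative, no branch
    static uint64_t abs_diff(uint64_t a, uint64_t b) {
        const uint64_t d = a - b;                  // wraps around if a < b
        const uint64_t mask = -(uint64_t)(a < b);  // 0 or 0xFFFFFFFFFFFFFFFF
        return (d ^ mask) - mask;                  // two's-complement negate when masked
    }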
@@ -5078,7 +5121,11 @@ struct VM {
         return result;
     };

-    // intel leaves on AMD and viceversa will still work for this probe
+    // Intel leaves on an AMD CPU and vice versa will still work for this probe
+    // for leaves like 0 that just return static data, like "AuthenticAMD" or "GenuineIntel", a fast exit path could be made
+    // for other leaves, like the extended state ones that rely on dynamic system state such as APIC IDs and XState, kernel data locks are required
+    // we try different leaves so that it is not worth it to just create a "fast" exit path, forcing guest TSC manipulation
+    // the vmexit itself has a latency of around 800 cycles; combined with the register saves and the cpuid information we require, it costs 1000+ cycles