You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
// the reason why we use CPUID rather than RDTSC is because RDTSC is a conditionally exiting instruction, and you can modify the guest TSC without trapping it
4786
4775
auto vm_exit = []() noexcept -> u64 {
4787
4776
volatile int regs[4] = { 0 }; // doesn't need to be as elaborate as the next cpuid_lambda we will use to calculate the real latency
@@ -4804,72 +4793,66 @@ struct VM {
4804
4793
volatile fn_t xor_ptr = +xor_lambda;
4805
4794
volatile u64 dummy = 0;
4806
4795
4807
-
// run the XOR loop briefly to force CPU out of sleep states/lower frequencies
4808
-
// This reduces the variance (jitter) between the two measurement loops
4809
-
// and confuses hypervisors targeting this check who might try to advance TSC when XOR might be running
4810
-
for (ULONG64 x = 0; x < 10000000; ++x) {
4811
-
dummy += xor_ptr();
4812
-
}
4796
+
// 6 ticks * 15.6ms ~= 100ms
4797
+
auto accumulate_and_measure = [&](volatile fn_t func_ptr) -> u64 {
4798
+
u64 total_tsc = 0;
4799
+
u64 total_qit = 0;
4800
+
u64 ticks_captured = 0;
4801
+
constexpr u64 TARGET_TICKS = 6;
4802
+
4803
+
// We continue until we have captured enough full tick windows
4804
+
while (ticks_captured < TARGET_TICKS) {
4805
+
u64 start_wait, now_wait;
4806
+
4807
+
// Wait for QIT tick edge to avoid granularity errors
4808
+
// syncing ensures we always start the measurement at the exact edge of a QIT update, eliminating jitter
4809
+
QueryInterruptTime(&start_wait);
4810
+
do {
4811
+
_mm_pause(); // hint to CPU we're spin-waiting
4812
+
QueryInterruptTime(&now_wait); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA
4813
+
} while (now_wait == start_wait);
4814
+
4815
+
// start of a new tick window
4816
+
const u64 qit_start = now_wait;
4817
+
const u64 tsc_start = __rdtsc();
4818
+
4819
+
u64 qit_current;
4820
+
// run until the tick updates again
4821
+
do {
4822
+
// unroll slightly to reduce overhead
4823
+
dummy += func_ptr(); dummy += func_ptr();
4824
+
dummy += func_ptr(); dummy += func_ptr();
4825
+
dummy += func_ptr(); dummy += func_ptr();
4826
+
4827
+
QueryInterruptTime(&qit_current);
4828
+
} while (qit_current == qit_start);
4829
+
4830
+
// end of tick window
4831
+
const u64 tsc_end = __rdtsc();
4832
+
4833
+
const u64 delta_qit = qit_current - qit_start;
4834
+
const u64 delta_tsc = tsc_end - tsc_start;
4835
+
4836
+
// we need to accumulate results, the more we do it, the more the hypervisor will downclock the TSC
4837
+
if (delta_qit > 0) {
4838
+
total_qit += delta_qit;
4839
+
total_tsc += delta_tsc;
4840
+
ticks_captured++;
4841
+
}
4842
+
}
4813
4843
4814
-
// first measurement
4815
-
ULONG64 beforeqit = 0;
4816
-
// Wait for QIT tick edge to avoid granularity errors
4817
-
// if our loop takes 20ms, we might capture one tick (15.6ms reported) or two ticks (31.2ms reported) depending on exactly when we started relative to the system timer interrupt
4818
-
// this causes the denominator in our ratio to jump by 50-100%, causing a delta artifact of exactly 32 (that would still be too small for the ratio diff to trigger, but anyways)
4819
-
// syncing ensures we always start the measurement at the exact edge of a QIT update, eliminating this jitter
4820
-
{
4821
-
ULONG64 start_wait, now_wait;
4822
-
QueryInterruptTime(&start_wait);
4823
-
do {
4824
-
_mm_pause(); // hint to CPU we're spin-waiting
4825
-
QueryInterruptTime(&now_wait); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA, reason why we use it
4826
-
} while (now_wait == start_wait);
4827
-
beforeqit = now_wait;
4828
-
}
4829
-
// using only one rdtsc call here is harmless
4830
-
// and it is intentional instead of manually computing with the frequency we got before (to avoid spoofing)
4831
-
const ULONG64 beforetsc = __rdtsc();
4832
-
4833
-
for (ULONG64 x = 0; x < count_first; ++x) {
4834
-
dummy += cp_ptr(); // this loop will be intercepted by a RDTSC trap, downscaling our TSC
4835
-
}
4836
-
4837
-
// the kernel routine that backs up this api runs at CLOCK_LEVEL(13), only preempted by IPI, POWER_LEVEL and NMIs
4838
-
// meaning it's highly accurate even with kernel noise, hence we don't need cluster or median computations to get precise ratios
mask = -(uint64_t)(firstRatio < secondRatio) -> 0 or 0xFFFFFFFFFFFFFFFF
@@ -4895,9 +4878,8 @@ struct VM {
4895
4878
// contrary to what someone could think, under heavy load the ratio will be more close to 0, it will also be closer to 0 if we assign CPUs to a VM in our host machine
4896
4879
// it will increase if the BIOS/UEFI is configured to run the TSC by "core usage", which is why we use this threshold check based on a lot of empirical data
4897
4880
// it increases because the CPUID instruction forces the CPU pipeline to drain and serialize (heavy workload), while the XOR loop is a tight arithmetic loop (throughput workload).
4898
-
// CPUs will boost to different frequencies for these two scenarios (for example 4.2GHz for XOR vs 4.0GHz for CPUID)
4881
+
// CPUs will boost to different frequencies for these two scenarios
4899
4882
// A difference of 5-10% in ratio (15-30 points) or even more is normal behavior on bare metal
4900
-
// lastly, we might see a small ratio always depending on which part of the tick we exactly start the measurement, which is the most important reason why we need a threshold
4901
4883
if (difference >= 100) {
4902
4884
debug("TIMER: An hypervisor has been detected intercepting TSC");
4903
4885
return true; // both ratios will always differ if TSC is downscaled, since the hypervisor can't account for the XOR/NOP loop
@@ -4919,7 +4901,7 @@ struct VM {
4919
4901
_mm_lfence();
4920
4902
4921
4903
// read start time
4922
-
u64 t1 = __rdtsc();
4904
+
const u64 t1 = __rdtsc();
4923
4905
4924
4906
// prevent the compiler from moving the __cpuid call before the t1 read
4925
4907
COMPILER_BARRIER();
@@ -4929,7 +4911,7 @@ struct VM {
4929
4911
COMPILER_BARRIER();
4930
4912
4931
4913
// the idea is to let rdtscp internally wait until cpuid is executed rather than using another memory barrier
4932
-
u64 t2 = __rdtscp(&aux);
4914
+
const u64 t2 = __rdtscp(&aux);
4933
4915
4934
4916
// ensure the read of t2 doesn't bleed into future instructions
4935
4917
_mm_lfence();
@@ -4949,7 +4931,7 @@ struct VM {
4949
4931
volatile unsigned int a, b, c, d;
4950
4932
4951
4933
// this differs from the code above because a, b, c and d are effectively "used"
4952
-
// because the compiler must honor the write to a volatile variable.
4934
+
// the compiler must honor the write to a volatile variable
0 commit comments