Skip to content

Commit 0a4d5c8

Browse files
author
Requiem
committed
fix: false flags on XOR vs __cpuid comparisons
1 parent 411f781 commit 0a4d5c8

File tree

1 file changed

+65
-83
lines changed

1 file changed

+65
-83
lines changed

src/vmaware.hpp

Lines changed: 65 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -4602,7 +4602,7 @@ struct VM {
46024602
return false;
46034603
}
46044604
// will be used in cpuid measurements later
4605-
u16 cycle_threshold = 800; // average latency of a VMX/SVM vmexit
4605+
u16 cycle_threshold = 800; // average latency of a VMX/SVM VMEXIT
46064606
if (util::hyper_x() == HYPERV_ARTIFACT_VM) {
46074607
cycle_threshold = 3250; // if we're running under Hyper-V, make VMAware detect nested virtualization
46084608
}
@@ -4771,17 +4771,6 @@ struct VM {
47714771
};
47724772

47734773
const entropy_provider entropyProv{};
4774-
// QueryInterruptTime uses 100-nanosecond units, but the value in KUSER_SHARED_DATA is only updated every 15.625ms (the default system timer tick)
4775-
// For example, 10000000 iterations at @ 3GHz is around 33ms
4776-
// 100000 iterations of XOR would run too quickly (0.03ms), we need at least one tick, so we put 60000000 (20-30ms avg)
4777-
// 100000 iterations of CPUID takes roughly 100ms-200ms (which is safe for QIT)
4778-
// However, we need both loops to execute for roughly the same Wall Clock time (~60-80ms) to ensure CPU Frequency (Turbo) remains consistent
4779-
// 1. CPUID Loop: 130000 iterations * 2000 cycles = 260M cycles
4780-
// 2. XOR Loop: 260M cycles / 18 cycles per iter (volatile overhead) = 14.5M iterations
4781-
// the goal is to ensure we pass the QIT 15.6ms resolution update threshold from usermode while minimizing thermal frequency drift
4782-
const ULONG64 count_first = rng(130000ULL, 135000ULL, [&entropyProv]() noexcept { return entropyProv(); });
4783-
const ULONG64 count_second = rng(14000000ULL, 15000000ULL, [&entropyProv]() noexcept { return entropyProv(); });
4784-
47854774
// the reason why we use CPUID rather than RDTSC is because RDTSC is a conditionally exiting instruction, and you can modify the guest TSC without trapping it
47864775
auto vm_exit = []() noexcept -> u64 {
47874776
volatile int regs[4] = { 0 }; // doesn't need to be as elaborated as the next cpuid_lambda we will use to calculate the real latency
@@ -4804,72 +4793,66 @@ struct VM {
48044793
volatile fn_t xor_ptr = +xor_lambda;
48054794
volatile u64 dummy = 0;
48064795

4807-
// run the XOR loop briefly to force CPU out of sleep states/lower frequencies
4808-
// This reduces the variance (jitter) between the two measurement loops
4809-
// and confuses hypervisors targetting this check who might try to advance TSC when XOR might be running
4810-
for (ULONG64 x = 0; x < 10000000; ++x) {
4811-
dummy += xor_ptr();
4812-
}
4796+
// 6 ticks * 15.625ms ~= 94ms
4797+
auto accumulate_and_measure = [&](volatile fn_t func_ptr) -> u64 {
4798+
u64 total_tsc = 0;
4799+
u64 total_qit = 0;
4800+
u64 ticks_captured = 0;
4801+
constexpr u64 TARGET_TICKS = 6;
4802+
4803+
// We continue until we have captured enough full tick windows
4804+
while (ticks_captured < TARGET_TICKS) {
4805+
u64 start_wait, now_wait;
4806+
4807+
// Wait for QIT tick edge to avoid granularity errors
4808+
// syncing ensures we always start the measurement at the exact edge of a QIT update, eliminating jitter
4809+
QueryInterruptTime(&start_wait);
4810+
do {
4811+
_mm_pause(); // hint to CPU we're spin-waiting
4812+
QueryInterruptTime(&now_wait); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA
4813+
} while (now_wait == start_wait);
4814+
4815+
// start of a new tick window
4816+
const u64 qit_start = now_wait;
4817+
const u64 tsc_start = __rdtsc();
4818+
4819+
u64 qit_current;
4820+
// run until the tick updates again
4821+
do {
4822+
// unroll slightly to reduce overhead
4823+
dummy += func_ptr(); dummy += func_ptr();
4824+
dummy += func_ptr(); dummy += func_ptr();
4825+
dummy += func_ptr(); dummy += func_ptr();
4826+
4827+
QueryInterruptTime(&qit_current);
4828+
} while (qit_current == qit_start);
4829+
4830+
// end of tick window
4831+
const u64 tsc_end = __rdtsc();
4832+
4833+
const u64 delta_qit = qit_current - qit_start;
4834+
const u64 delta_tsc = tsc_end - tsc_start;
4835+
4836+
// we need to accumulate results; the more we do it, the more the hypervisor will downclock the TSC
4837+
if (delta_qit > 0) {
4838+
total_qit += delta_qit;
4839+
total_tsc += delta_tsc;
4840+
ticks_captured++;
4841+
}
4842+
}
48134843

4814-
// first measurement
4815-
ULONG64 beforeqit = 0;
4816-
// Wait for QIT tick edge to avoid granularity errors
4817-
// if our loop takes 20ms, we might capture one tick (15.6ms reported) or two ticks (31.2ms reported) depending on exactly when we started relative to the system timer interrupt
4818-
// this causes the denominator in our ratio to jump by 50-100%, causing a delta artifact of exactly 32 (that would still be too small for the ratio diff to trigger, but anyways)
4819-
// syncing ensures we always start the measurement at the exact edge of a QIT update, eliminating this jitter
4820-
{
4821-
ULONG64 start_wait, now_wait;
4822-
QueryInterruptTime(&start_wait);
4823-
do {
4824-
_mm_pause(); // hint to CPU we-re spin-waiting
4825-
QueryInterruptTime(&now_wait); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA, reason why we use it
4826-
} while (now_wait == start_wait);
4827-
beforeqit = now_wait;
4828-
}
4829-
// using only one rdtsc call here is harmless
4830-
// and it is intentional instead of manually computing with the frequency we got before (to avoid spoofing)
4831-
const ULONG64 beforetsc = __rdtsc();
4832-
4833-
for (ULONG64 x = 0; x < count_first; ++x) {
4834-
dummy += cp_ptr(); // this loop will be intercepted by a RDTSC trap, downscaling our TSC
4835-
}
4836-
4837-
// the kernel routine that backs up this api runs at CLOCK_LEVEL(13), only preempted by IPI, POWER_LEVEL and NMIs
4838-
// meaning it's highly accurate even with kernel noise, hence we don't need cluster or median computations to get precise ratios
4839-
ULONG64 afterqit = 0;
4840-
QueryInterruptTime(&afterqit);
4841-
const ULONG64 aftertsc = __rdtsc();
4842-
4843-
const ULONG64 dtsc1 = aftertsc - beforetsc;
4844-
const ULONG64 dtq1 = afterqit - beforeqit;
4845-
const ULONG64 firstRatio = (dtq1 != 0) ? (dtsc1 / dtq1) : 0ULL;
4846-
4847-
// second measurement
4848-
ULONG64 beforeqit2 = 0;
4849-
// wait for QIT tick edge for the second measurement as well
4850-
{
4851-
ULONG64 start_wait, now_wait;
4852-
QueryInterruptTime(&start_wait);
4853-
do {
4854-
_mm_pause();
4855-
QueryInterruptTime(&now_wait);
4856-
} while (now_wait == start_wait);
4857-
beforeqit2 = now_wait;
4858-
}
4859-
const ULONG64 beforetsc2 = __rdtsc();
4844+
// Total TSC Cycles / Total QIT Units
4845+
if (total_qit == 0) return 0;
4846+
return total_tsc / total_qit;
4847+
};
48604848

4861-
for (ULONG64 x = 0; x < count_second; ++x) {
4862-
dummy += xor_ptr(); // this loop won't be intercepted, it never switches to kernel-mode
4863-
}
4864-
VMAWARE_UNUSED(dummy);
4849+
// first measurement (CPUID / VMEXIT)
4850+
const ULONG64 firstRatio = accumulate_and_measure(cp_ptr);
48654851

4866-
ULONG64 afterqit2 = 0;
4867-
QueryInterruptTime(&afterqit2);
4868-
const ULONG64 aftertsc2 = __rdtsc();
4852+
// second measurement (XOR / ALU)
4853+
const ULONG64 secondRatio = accumulate_and_measure(xor_ptr);
48694854

4870-
const ULONG64 dtsc2 = aftertsc2 - beforetsc2;
4871-
const ULONG64 dtq2 = afterqit2 - beforeqit2;
4872-
const ULONG64 secondRatio = (dtq2 != 0) ? (dtsc2 / dtq2) : 0ULL;
4855+
VMAWARE_UNUSED(dummy);
48734856

48744857
/* branchless absolute difference is like:
48754858
mask = -(uint64_t)(firstRatio < secondRatio) -> 0 or 0xFFFFFFFFFFFFFFFF
@@ -4895,9 +4878,8 @@ struct VM {
48954878
// contrary to what someone could think, under heavy load the ratio will be closer to 0; it will also be closer to 0 if we assign CPUs to a VM in our host machine
48964879
// it will increase if the BIOS/UEFI is configured to run the TSC by "core usage", which is why we use this threshold check based on a lot of empirical data
48974880
// it increases because the CPUID instruction forces the CPU pipeline to drain and serialize (heavy workload), while the XOR loop is a tight arithmetic loop (throughput workload).
4898-
// CPUs will boost to different frequencies for these two scenarios (for example 4.2GHz for XOR vs 4.0GHz for CPUID)
4881+
// CPUs will boost to different frequencies for these two scenarios
48994882
// A difference of 5-10% in ratio (15-30 points) or even more is normal behavior on bare metal
4900-
// lastly, we might see a small ratio always depending on which part of the tick we exactly start the measurement, which is the most important reason why we need a threshold
49014883
if (difference >= 100) {
49024884
debug("TIMER: An hypervisor has been detected intercepting TSC");
49034885
return true; // both ratios will always differ if TSC is downscaled, since the hypervisor can't account for the XOR/NOP loop
@@ -4919,7 +4901,7 @@ struct VM {
49194901
_mm_lfence();
49204902

49214903
// read start time
4922-
u64 t1 = __rdtsc();
4904+
const u64 t1 = __rdtsc();
49234905

49244906
// prevent the compiler from moving the __cpuid call before the t1 read
49254907
COMPILER_BARRIER();
@@ -4929,7 +4911,7 @@ struct VM {
49294911
COMPILER_BARRIER();
49304912

49314913
// the idea is to let rdtscp internally wait until cpuid is executed rather than using another memory barrier
4932-
u64 t2 = __rdtscp(&aux);
4914+
const u64 t2 = __rdtscp(&aux);
49334915

49344916
// ensure the read of t2 doesn't bleed into future instructions
49354917
_mm_lfence();
@@ -4949,7 +4931,7 @@ struct VM {
49494931
volatile unsigned int a, b, c, d;
49504932

49514933
// this differs from the code above because a, b, c and d are effectively "used"
4952-
// because the compiler must honor the write to a volatile variable.
4934+
// the compiler must honor the write to a volatile variable
49534935
asm volatile("cpuid"
49544936
: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
49554937
: "a"(leaf)
@@ -4960,8 +4942,8 @@ struct VM {
49604942
asm volatile("rdtscp" : "=a"(lo2), "=d"(hi2) :: "rcx", "memory");
49614943
asm volatile("lfence" ::: "memory");
49624944

4963-
u64 t1 = (u64(hi1) << 32) | lo1;
4964-
u64 t2 = (u64(hi2) << 32) | lo2;
4945+
const u64 t1 = (u64(hi1) << 32) | lo1;
4946+
const u64 t2 = (u64(hi2) << 32) | lo2;
49654947

49664948
return t2 - t1;
49674949
#endif
@@ -5121,7 +5103,7 @@ struct VM {
51215103
// pre-allocate sample buffer and touch pages to avoid page faults by MMU during measurement
51225104
std::vector<u64> samples;
51235105
samples.resize(n_leaves * iterations);
5124-
for (size_t i = 0; i < samples.size(); ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset)
5106+
for (size_t i = 0; i < samples.size(); ++i) samples[i] = 0; // or RtlSecureZeroMemory (memset) if Windows
51255107

51265108
/*
51275109
* We want to move our thread from the Running state to the Waiting state
@@ -5166,7 +5148,7 @@ struct VM {
51665148
return true;
51675149
}
51685150
else if (cpuid_latency <= 25) {
5169-
// cpuid is fully serializing, not even old CPUs have this low average cycles in real-world scenarios
5151+
// cpuid is fully serializing; no CPU has this low an average cycle count in real-world scenarios
51705152
// however, in patches, zero or even negative deltas can often be seen
51715153
return true;
51725154
}

0 commit comments

Comments
 (0)