Commit b34ce23
Merge pull request #624 from NotRequiem/main
direct guest TSC manipulation checks
2 parents d849e13 + fe418a7

1 file changed: src/vmaware.hpp (82 additions, 39 deletions)
@@ -4602,7 +4602,7 @@ struct VM {
    // will be used in cpuid measurements later
    u16 cycle_threshold = 800;
    if (util::hyper_x() == HYPERV_ARTIFACT_VM) {
-       cycle_threshold = 3500; // if we're running under Hyper-V, make VMAware detect nested virtualization
+       cycle_threshold = 3250; // if we're running under Hyper-V, make VMAware detect nested virtualization
    }

#if (WINDOWS)
@@ -4767,12 +4767,12 @@ struct VM {
        }
    }

-   // RDTSC trap detection
-   // This detection uses two clocks and two loops, a loop that the hypervisor can spoof and a loop that the hypervisor cannot
-   // When RDTSC is hooked, the hypervisor usually "downscales" the result to hide the time passed or doesnt let TSC advance for the time it was vm-exiting
+   /* TSC offsetting detection */
+   // This detection uses two clocks and two loops: a loop and a timer that the hypervisor can spoof, and a second loop/timer that it cannot
+   // When the TSC is "hooked", the hypervisor usually downscales the result to hide the time that passed, or doesn't let the TSC advance while it was vm-exiting
    // However, the hypervisor has absolutely no way to downscale time for the second loop, because it runs natively on the CPU without exiting
-   // This creates a discrepancy in the ratio of both loops
-   // The hypervisor cannot easily rewind the system wall clock (second loop, QIT/KUSER_SHARED_DATA) without causing system instability (network timeouts, audio lag)
+   // This creates a massive discrepancy in the ratio of both loops, contrary to the very small ratio seen when both timers run normally
+   // The hypervisor cannot easily rewind the system wall clock (second loop, QIT/KUSER_SHARED_DATA) without causing system instability (network timeouts, audio lag, etc.)
    static thread_local volatile u64 g_sink = 0; // thread_local volatile so that it doesn't need to be captured by the lambda

    // First we start by randomizing counts WITHOUT syscalls and WITHOUT using instructions that can be trapped by hypervisors, which was a hard task
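
As a concrete illustration of the two-clock idea described above, here is a minimal standalone sketch, not VMAware's actual code: the name cycles_per_tick is hypothetical, and the real implementation below randomizes iteration counts, syncs to tick edges and raises thread priority first.

// Minimal sketch of the two-clock/two-loop idea (assumes Windows 10+ and MSVC intrinsics).
#include <windows.h>
#include <realtimeapiset.h> // QueryInterruptTime
#include <intrin.h>         // __rdtsc

// Cycles elapsed per 100ns interrupt-time unit while running `body` `iters` times.
template <typename F>
unsigned long long cycles_per_tick(F body, unsigned long long iters) {
    ULONGLONG qit0 = 0, qit1 = 0;
    QueryInterruptTime(&qit0);                  // wall clock the hypervisor can't rewind
    const unsigned long long tsc0 = __rdtsc();  // counter the hypervisor CAN offset
    for (unsigned long long i = 0; i < iters; ++i) body();
    const unsigned long long tsc1 = __rdtsc();
    QueryInterruptTime(&qit1);
    return (tsc1 - tsc0) / (qit1 - qit0 + 1);   // +1 guards against a zero-tick interval
}

If a hypervisor downscales the TSC only while the exiting loop runs, the two cycles_per_tick results diverge heavily; natively they stay close.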
@@ -4833,7 +4833,7 @@ struct VM {
    };

    // Use rejection sampling as before to avoid modulo bias
-   auto generate_iteration_value = [](ULONG64 min, ULONG64 max, auto getrand) noexcept -> ULONG64 {
+   auto rng = [](ULONG64 min, ULONG64 max, auto getrand) noexcept -> ULONG64 {
        const ULONG64 range = max - min + 1;
        const ULONG64 limit = (~0ULL) - ((~0ULL) % range);
        for (;;) {
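
The rejection-sampling loop continues in the next hunk; in isolation, the technique the lambda implements looks like this minimal sketch (the helper name unbiased_range is hypothetical; the real code draws its entropy from entropy_provider):

#include <cstdint>

// Returns an unbiased value in [min, max], assuming get_rand() yields uniform uint64_t.
// Raw values at or above `limit` would wrap unevenly under the modulo, so they are redrawn.
template <typename F>
uint64_t unbiased_range(uint64_t min, uint64_t max, F get_rand) {
    const uint64_t range = max - min + 1;
    const uint64_t limit = UINT64_MAX - (UINT64_MAX % range); // largest multiple of range
    for (;;) {
        const uint64_t r = get_rand();
        if (r < limit) return min + (r % range); // accepted: the modulo is now bias-free
        // rejected: redraw
    }
}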
@@ -4848,17 +4848,26 @@ struct VM {
    };

    const entropy_provider entropyProv{};
-   const ULONG64 count_first = generate_iteration_value(30000000ULL, 40000000ULL, [&entropyProv]() noexcept { return entropyProv(); });
-   const ULONG64 count_second = generate_iteration_value(300000000ULL, 400000000ULL, [&entropyProv]() noexcept { return entropyProv(); });
-
-   auto rd_lambda = []() noexcept -> u64 {
-       u64 v = __rdtsc();
-       g_sink ^= v;
-       return v;
+   // QueryInterruptTime uses 100-nanosecond units, but the value in KUSER_SHARED_DATA is only updated every 15.625ms (the default system timer tick)
+   // For example, 10000000 iterations at 3GHz is around 33ms
+   // 100000 iterations of XOR would run too quickly (0.03ms); we need at least one tick, so we put 60000000 (20-30ms avg)
+   // 100000 iterations of CPUID takes roughly 100-200ms (which is safe for QIT)
+   // However, we need both loops to execute for roughly the same wall-clock time (~60-80ms) to ensure the CPU frequency (Turbo) remains consistent
+   // 1. CPUID loop: 130000 iterations * 2000 cycles = 260M cycles
+   // 2. XOR loop: 260M cycles / 18 cycles per iter (volatile overhead) = 14.5M iterations
+   // the goal is to ensure we pass the 15.6ms QIT update resolution threshold from usermode while minimizing thermal frequency drift
+   const ULONG64 count_first = rng(130000ULL, 135000ULL, [&entropyProv]() noexcept { return entropyProv(); });
+   const ULONG64 count_second = rng(14000000ULL, 15000000ULL, [&entropyProv]() noexcept { return entropyProv(); });
+
+   // the reason we use CPUID rather than RDTSC is that RDTSC is a conditionally exiting instruction, and the guest TSC can be modified without trapping it
+   auto vm_exit = []() noexcept -> u64 {
+       volatile int regs[4] = { 0 }; // doesn't need to be as elaborate as the cpuid_lambda we will use later to calculate the real latency
+       __cpuid((int*)regs, 0); // unconditional vmexit
+       return (u64)regs[0]; // data dependency so /O2 builds can't elide the call; the CPU cannot start the next loop iteration until the current __cpuid writes to regs
    };

    auto xor_lambda = []() noexcept -> u64 {
-       volatile u64 a = 0xDEADBEEFDEADBEEFull; // can be replaced by NOPs, the core idea is to use a non-trappable instruction that the hv cannot virtualize
+       volatile u64 a = 0xDEADBEEFDEADBEEFull; // can be replaced with NOPs, etc; the core idea is to use a non-trappable instruction that the hv cannot virtualize
        volatile u64 b = 0x1234567890ABCDEFull;
        u64 v = a ^ b;
        g_sink ^= v;
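
The cycle-budget arithmetic in the comments above can be sanity-checked in a few constexpr lines; all constants here are illustrative, assuming roughly 3.5GHz and the ~2000-cycle CPUID / ~18-cycle XOR per-iteration costs the comments cite:

// Sanity check of the budget from the comments (all constants illustrative).
constexpr unsigned long long kHz          = 3500000000ULL;         // assumed core clock
constexpr unsigned long long kCpuidCycles = 130000ULL * 2000ULL;   // ~260M cycles
constexpr unsigned long long kXorCycles   = 14500000ULL * 18ULL;   // ~261M cycles
constexpr double kCpuidMs = kCpuidCycles * 1000.0 / kHz;           // ~74 ms
constexpr double kXorMs   = kXorCycles   * 1000.0 / kHz;           // ~75 ms
static_assert(kCpuidMs > 15.625 && kXorMs > 15.625,
              "both loops span at least one 15.625ms QIT tick");

Both loops land in the same ~60-80ms wall-clock window, which is what keeps Turbo behavior comparable between them.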
@@ -4868,17 +4877,38 @@ struct VM {
    using fn_t = u64(*)();

    // make the pointer volatile so the compiler treats the call as opaque/indirect
-   volatile fn_t rd_ptr = +rd_lambda; // +lambda forces conversion to function ptr, so it won't be inlined, we need to prevent the compiler from inlining this
+   volatile fn_t cp_ptr = +vm_exit; // +lambda forces conversion to function ptr so it won't be inlined; we need to prevent the compiler from inlining this
    volatile fn_t xor_ptr = +xor_lambda;
+   volatile u64 dummy = 0;
+
+   // run the XOR loop briefly to force the CPU out of sleep states/lower frequencies
+   // This reduces the variance (jitter) between the two measurement loops
+   // and confuses hypervisors targeting this check that might try to advance the TSC while XOR might be running
+   for (ULONG64 x = 0; x < 10000000; ++x) {
+       dummy += xor_ptr();
+   }

    // first measurement
    ULONG64 beforeqit = 0;
-   QueryInterruptTime(&beforeqit); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA, reason why we use it
-   const ULONG64 beforetsc = __rdtsc();
+   // Wait for a QIT tick edge to avoid granularity errors
+   // if our loop takes 20ms, we might capture one tick (15.6ms reported) or two ticks (31.2ms reported) depending on exactly when we started relative to the system timer interrupt
+   // this causes the denominator in our ratio to jump by 50-100%, causing a delta artifact of exactly 32 (still too small for the ratio diff to trigger, but anyway)
+   // syncing ensures we always start the measurement at the exact edge of a QIT update, eliminating this jitter
+   {
+       ULONG64 start_wait, now_wait;
+       QueryInterruptTime(&start_wait);
+       do {
+           _mm_pause(); // hint to the CPU that we're spin-waiting
+           QueryInterruptTime(&now_wait); // never touches RDTSC/RDTSCP or transitions to kernel-mode, just reads from KUSER_SHARED_DATA, which is why we use it
+       } while (now_wait == start_wait);
+       beforeqit = now_wait;
+   }
+   // using only one rdtsc call here is harmless
+   // and it is intentional instead of manually computing with the frequency we got before (to avoid spoofing)
+   const ULONG64 beforetsc = __rdtsc();

-   volatile u64 dummy = 0;
    for (ULONG64 x = 0; x < count_first; ++x) {
-       dummy = rd_ptr(); // this loop will be intercepted by a RDTSC trap, downscaling our TSC
+       dummy += cp_ptr(); // this loop forces vm-exits (CPUID), letting a TSC-offsetting hypervisor downscale our TSC
    }

    // the kernel routine that backs this API runs at CLOCK_LEVEL (13), only preempted by IPI, POWER_LEVEL and NMIs
@@ -4893,11 +4923,20 @@ struct VM {

    // second measurement
    ULONG64 beforeqit2 = 0;
-   QueryInterruptTime(&beforeqit2);
+   // wait for a QIT tick edge for the second measurement as well
+   {
+       ULONG64 start_wait, now_wait;
+       QueryInterruptTime(&start_wait);
+       do {
+           _mm_pause();
+           QueryInterruptTime(&now_wait);
+       } while (now_wait == start_wait);
+       beforeqit2 = now_wait;
+   }
    const ULONG64 beforetsc2 = __rdtsc();

    for (ULONG64 x = 0; x < count_second; ++x) {
-       dummy = xor_ptr(); // this loop won't be intercepted, it never switches to kernel-mode
+       dummy += xor_ptr(); // this loop won't be intercepted, it never switches to kernel-mode
    }
    VMAWARE_UNUSED(dummy);
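
Since the same tick-edge wait now runs before both measurements, it could be factored into a small helper. A minimal sketch under that assumption (the name wait_for_qit_edge is hypothetical and not part of this patch):

// Hypothetical refactor: spin until KUSER_SHARED_DATA's interrupt time advances,
// returning the fresh value so the measurement starts exactly on a tick edge.
static ULONG64 wait_for_qit_edge() noexcept {
    ULONGLONG start_wait = 0, now_wait = 0;
    QueryInterruptTime(&start_wait);
    do {
        _mm_pause();                    // spin-wait hint to the CPU
        QueryInterruptTime(&now_wait);  // user-mode read, no kernel transition
    } while (now_wait == start_wait);   // wait for the 15.625ms tick to advance
    return (ULONG64)now_wait;
}
// usage: beforeqit = wait_for_qit_edge(); const ULONG64 beforetsc = __rdtsc();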
@@ -4911,14 +4950,14 @@ struct VM {

    /* branchless absolute difference is like:
       mask = -(uint64_t)(firstRatio < secondRatio) -> 0 or 0xFFFFFFFFFFFFFFFF
-      diff = firstRatio - secondRatio
-      abs = (diff ^ mask) - mask
+       diff = firstRatio - secondRatio
+       abs = (diff ^ mask) - mask
    */
    const ULONG64 diffMask = (ULONG64)0 - (ULONG64)(firstRatio < secondRatio); // all-ones if first<second, else 0
    const ULONG64 diff = firstRatio - secondRatio; // unsigned subtraction
    const ULONG64 difference = (diff ^ diffMask) - diffMask; // absolute difference, unsigned

-   debug("TIMER: RDTSC -> ", firstRatio, ", QIT -> ", secondRatio, ", Ratio: ", difference);
+   debug("TIMER: TSC -> ", firstRatio, ", Interrupt -> ", secondRatio, ", Ratio: ", difference);

    if (prevMask != 0) {
        pNtSetInformationThread(
@@ -4931,10 +4970,14 @@ struct VM {

    // QIT is updated in intervals of 100 nanoseconds
    // contrary to what someone might think, under heavy load the ratio will be closer to 0; it will also be closer to 0 if we assign CPUs to a VM on our host machine
-   // it will increase if the BIOS is configured to run the TSC by "core usage", which is why we use a 100 threshold check based on a lot of empirical data
-   if (difference > 100) {
-       debug("TIMER: An hypervisor has been detected intercepting RDTSC");
-       return true; // both ratios will always differ if a RDTSC trap is present, since the hypervisor can't account for the XOR/NOP loop
+   // it will increase if the BIOS/UEFI is configured to run the TSC by "core usage", which is why we use this threshold check based on a lot of empirical data
+   // it increases because the CPUID instruction forces the CPU pipeline to drain and serialize (heavy workload), while the XOR loop is a tight arithmetic loop (throughput workload)
+   // CPUs will boost to different frequencies for these two scenarios (for example 4.2GHz for XOR vs 4.0GHz for CPUID)
+   // a difference of 5-10% in ratio (15-30 points) or even more is normal behavior on bare metal
+   // lastly, a small ratio difference is always possible depending on exactly where in the tick the measurement starts, which is the most important reason we need a threshold
+   if (difference >= 100) {
+       debug("TIMER: A hypervisor has been detected intercepting TSC");
+       return true; // both ratios will always differ if the TSC is downscaled, since the hypervisor can't account for the XOR/NOP loop
    }
#endif

@@ -5078,7 +5121,11 @@ struct VM {
        return result;
    };

-   // intel leaves on AMD and viceversa will still work for this probe
+   // Intel leaves on an AMD CPU and vice versa will still work for this probe
+   // for leaves like 0 that just return static data, like "AuthenticAMD" or "GenuineIntel", a fast exit path could be made
+   // for other leaves like the extended state, which rely on dynamic system state such as APIC IDs and XState, kernel data locks are required
+   // we try different leaves so that it is not worth creating a "fast" exit path, forcing the hypervisor into guest TSC manipulation instead
+   // the vmexit itself has a latency of around 800 cycles; combined with the register saves and the cpuid information we require, it costs 1000+ cycles
    constexpr unsigned int leaves[] = {
        0xB, // topology
        0xD, // xsave/xstate
@@ -5090,7 +5137,7 @@ struct VM {
        0x5, // MONITOR/MWAIT
        0x40000000u, // hypervisor range start
        0x80000008u, // extended address limits (amd/intel ext)
-       0x0 // fallback to leaf 0 occasionally
+       0x0 // fallback to leaf 0 occasionally
    };
    constexpr size_t n_leaves = sizeof(leaves) / sizeof(leaves[0]);

@@ -5128,7 +5175,7 @@ struct VM {
        VMAWARE_UNUSED(tmp);
    }

-   // 100 iterations per leaf, store contiguously per-leaf
+   // 100 iterations per leaf, store contiguously per-leaf, so 1100 runs in total
    for (size_t li = 0; li < n_leaves; ++li) {
        const unsigned int leaf = leaves[li];
        for (unsigned i = 0; i < iterations; ++i) {
@@ -5143,7 +5190,9 @@ struct VM {
    if (cpuid_latency >= cycle_threshold) {
        return true;
    }
-   else if (cpuid_latency <= 20) { // cpuid is fully serializing, not even old CPUs have this low average cycles in real-world scenarios
+   else if (cpuid_latency <= 25) {
+       // cpuid is fully serializing, and not even old CPUs have such a low average cycle count in real-world scenarios
+       // however, with guest TSC patches, zero or even negative deltas are often seen
        return true;
    }
    // TLB flushes or side-channel cache attacks are not even tried due to how unreliable they are against stealthy hypervisors
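
The latency probe's core idea in standalone form, as a minimal sketch rather than the library's implementation: the name and constants are illustrative, and the real probe pins the thread, cycles through the leaves above and aggregates per-leaf results.

#include <intrin.h> // __cpuid, __rdtsc

// Average the cost of a forced vm-exit; flag both suspicious extremes.
static bool cpuid_latency_suspicious(unsigned long long threshold_cycles) {
    constexpr unsigned kIters = 100;
    unsigned long long total = 0;
    int regs[4] = { 0 };
    for (unsigned i = 0; i < kIters; ++i) {
        const unsigned long long t0 = __rdtsc();
        __cpuid(regs, 0);                  // unconditionally exits under a hypervisor
        const unsigned long long t1 = __rdtsc();
        total += (t1 - t0);
    }
    const unsigned long long avg = total / kIters;
    // too slow -> vm-exit overhead; too fast -> a patched/downscaled guest TSC
    return (avg >= threshold_cycles) || (avg <= 25);
}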
@@ -7432,12 +7481,6 @@ struct VM {
        return core::add(brands::VBOX);
    }

-   std::unique_ptr<std::string> sys_vmware = util::sys_result("ioreg -l | grep -i -c -e \"vmware\"");
-
-   if (std::stoi(*sys_vmware) > 0) {
-       return core::add(brands::VMWARE);
-   }
-
    return false;
};
