Skip to content

Commit 03bd442

Browse files
author
Requiem
committed
feat: global TSC shaving detection via IPC
1 parent 4f4e61a commit 03bd442

1 file changed

Lines changed: 30 additions & 57 deletions

File tree

src/vmaware.hpp

Lines changed: 30 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5285,7 +5285,7 @@ struct VM {
52855285
// const size_t frac_win = (N * 8 + 99) / 100; // ceil(N * 0.08)
52865286
// const size_t win = std::min(N, std::max(MIN_WIN, frac_win));
52875287
const size_t MIN_WIN = 10;
5288-
// Manual min/max calculation for win size
5288+
// manual min/max calculation for win size
52895289
const size_t calc_frac = static_cast<size_t>(std::ceil(static_cast<double>(N) * 0.08));
52905290
const size_t inner_max = (MIN_WIN > calc_frac) ? MIN_WIN : calc_frac;
52915291
const size_t win = (N < inner_max) ? N : inner_max;
@@ -5311,7 +5311,7 @@ struct VM {
53115311
if (static_cast<long double>(new_span) <= EXPAND_FACTOR * static_cast<long double>(best_span) ||
53125312
(s[cluster_hi - 1] <= (s[cluster_lo - 1] + static_cast<u64>(std::ceil(3.0L * sigma))))) {
53135313
--cluster_lo;
5314-
// Manual min calculation
5314+
// manual min calculation
53155315
best_span = (best_span < new_span) ? best_span : new_span;
53165316
}
53175317
else break;
@@ -5322,7 +5322,6 @@ struct VM {
53225322
if (static_cast<long double>(new_span) <= EXPAND_FACTOR * static_cast<long double>(best_span) ||
53235323
(s[cluster_hi] <= (s[cluster_lo] + static_cast<u64>(std::ceil(3.0L * sigma))))) {
53245324
++cluster_hi;
5325-
// Manual min calculation
53265325
best_span = (best_span < new_span) ? best_span : new_span;
53275326
}
53285327
else break;
@@ -5333,8 +5332,7 @@ struct VM {
53335332
// cluster must be reasonably dense and cover a non-negligible portion of samples, so this is pure sanity checks
53345333
const double fraction_in_cluster = static_cast<double>(cluster_size) / static_cast<double>(N);
53355334

5336-
// Manual min/max calculation for MIN_CLUSTER
5337-
// Original: std::min(static_cast<size_t>(std::max<int>(5, static_cast<int>(N / 50))), N);
5335+
// min/max calculation for MIN_CLUSTER
53385336
const int val_n_50 = static_cast<int>(N / 50);
53395337
const size_t val_max = static_cast<size_t>((5 > val_n_50) ? 5 : val_n_50);
53405338
const size_t MIN_CLUSTER = (val_max < N) ? val_max : N; // at least 2% or 5 elements
@@ -5477,10 +5475,12 @@ struct VM {
54775475
std::vector<u64> used;
54785476
for (u64 s : samples) if (s != 0) used.push_back(s);
54795477
const u64 cpuid_latency = calculate_latency(used);
5478+
const double cycles_per_iter = static_cast<double>(t1_delta) / static_cast<double>(ITER_XOR);
54805479

54815480
debug("TIMER: T1 delta: ", t1_delta);
54825481
debug("TIMER: T2 delta: ", t2_delta);
54835482
debug("TIMER: VMEXIT latency: ", cpuid_latency);
5483+
debug("TIMER: IPC: ", cycles_per_iter);
54845484

54855485
if (cpuid_latency >= cycle_threshold) {
54865486
debug("TIMER: Detected a vmexit on CPUID");
@@ -5493,74 +5493,47 @@ struct VM {
54935493
return true;
54945494
}
54955495

5496-
// ========================== LOCAL RATIO ==========================
5497-
54985496
// Within the same run, does Thread 2 see a smaller TSC delta than Thread 1?
54995497
// If so, a hypervisor downscaled TSC in the core where exits were occurring to hide vmexit latency
55005498
// while in the other core where no exits occurred, no TSC cycles were decreased, thus thread 2 ran faster than thread 1
55015499
// this logic can be bypassed if the hypervisor downscales TSC in both cores, and that's precisely why we do now a Global Ratio
55025500
const double local_ratio = double(t2_delta) / double(t1_delta);
55035501

55045502
if (local_ratio < 0.95 || local_ratio > 1.05) {
5505-
debug("TIMER: Detected a hypervisor intercepting TSC: ", local_ratio, "");
5503+
debug("TIMER: Detected a hypervisor intercepting TSC locally: ", local_ratio, "");
55065504
return true;
55075505
}
55085506

5509-
#if (WINDOWS)
5510-
typedef struct _PROCESSOR_POWER_INFORMATION {
5511-
u32 Number;
5512-
u32 MaxMhz;
5513-
u32 CurrentMhz;
5514-
u32 MhzLimit;
5515-
u32 MaxIdleState;
5516-
u32 CurrentIdleState;
5517-
} PROCESSOR_POWER_INFORMATION, * PPROCESSOR_POWER_INFORMATION;
5518-
5519-
enum POWER_INFORMATION_LEVEL_MIN {
5520-
ProcessorInformation = 11
5521-
};
5522-
5523-
const HMODULE hPowr = LoadLibraryA("powrprof.dll");
5524-
if (!hPowr) return 0;
5525-
5526-
const char* names[] = { "CallNtPowerInformation" };
5527-
void* funcs[1] = { nullptr };
5528-
util::get_function_address(hPowr, names, funcs, 1);
5529-
if (!funcs[0]) return 0;
5530-
5531-
using CallNtPowerInformation_t = NTSTATUS(__stdcall*)(int, PVOID, ULONG, PVOID, ULONG);
5532-
CallNtPowerInformation_t CallNtPowerInformation =
5533-
reinterpret_cast<CallNtPowerInformation_t>(funcs[0]);
5534-
5535-
SYSTEM_INFO si;
5536-
GetSystemInfo(&si);
5537-
const DWORD procCount = si.dwNumberOfProcessors;
5538-
if (procCount == 0) return 0;
5507+
// To calculate the global ratio, we calculate the TSC cycles consumed per iteration of the Thread 1 workload
5508+
// Thread 1 ran this dependency chain, x ^= i; x = (x << 1) ^ (x >> 3)
5509+
// because each instruction depends on the result of the previous one, the CPU cannot execute these in parallel
5510+
// on bare metal, a dependent ALU operation takes a minimum number of core cycles, which is typically 2-4 per iteration
5511+
// Even with an extreme 6.0GHz Turbo on a 2.0GHz Base Clock (3x ratio), this results in at least 0.7 to 1.0 TSC cycles per iteration
55395512

5540-
const SIZE_T bufSize = static_cast<SIZE_T>(procCount) * sizeof(PROCESSOR_POWER_INFORMATION);
5541-
void* raw = _malloca(bufSize);
5542-
if (!raw) return 0;
5543-
memset(raw, 0, bufSize);
5513+
// If a hypervisor is "shaving" cycles globally to hide the latency of Thread 2's CPUID spam,
5514+
// it subtracts time from the global counter. This causes Thread 1 (which caused no exits)
5515+
// to appear to have finished its work instantly or impossibly fast.
5516+
// If thread 1 reports finishing 100000000 dependent iterations in only 10000000 TSC cycles (so 0.1 cycles/iter), the CPU effectively ran at 10 instructions per cycle on a dependent chain
5517+
// which is basically impossible on x86 silicon and confirms the TSC was manipulated
55445518

5545-
const NTSTATUS status = CallNtPowerInformation(
5546-
ProcessorInformation,
5547-
nullptr, 0,
5548-
raw, static_cast<ULONG>(bufSize)
5549-
);
5550-
5551-
unsigned speed = 0;
5552-
if ((LONG)status >= 0) {
5553-
PROCESSOR_POWER_INFORMATION* info = reinterpret_cast<PROCESSOR_POWER_INFORMATION*>(raw);
5554-
speed = static_cast<unsigned>(info[0].CurrentMhz);
5519+
// 0.25 is an extremely conservative threshold but there's no need to raise it
5520+
// because mathematically speaking, considering the number of times cpuid is called in thread 2, the shaved cycles will exceed this limit even with a patch that downscales only 100 cycles per cpuid call
5521+
// a value this low implies the code ran 4x faster than the theoretical limit of the silicon,
5522+
// or roughly 12x faster than the actual core clock speed
5523+
5524+
// This is immune to turbo/throttling noise because those variances (10-30%) are negligible
5525+
// compared to the massive reduction (like 90%+) caused by exit hiding
5526+
if (cycles_per_iter < 0.25) {
5527+
debug("TIMER: Detected a hypervisor downscaling TSC globally (IPC was impossible): ", cycles_per_iter);
5528+
return true;
55555529
}
55565530

5557-
_freea(raw);
5558-
5559-
if (speed < 800) {
5560-
debug("TIMER: Detected a hook in rdtsc, frequency was: ", speed);
5531+
// so if the patch subtracted too many TSC cycles, the shaving might result in a near-zero or negative delta
5532+
// (handled by u64 overflow usually making it huge, but good shaves clamp to 0 or 1)
5533+
if (t1_delta < 1000) {
5534+
debug("TIMER: Detected a hypervisor downscaling TSC globally (time was stopped): ", t1_delta);
55615535
return true;
55625536
}
5563-
#endif
55645537
#endif
55655538
return false;
55665539
}

0 commit comments

Comments
 (0)