Skip to content

Commit 03bd442

Browse files
author
Requiem
committed
feat: global TSC shaving detection via IPC
1 parent 4f4e61a commit 03bd442

1 file changed

Lines changed: 30 additions & 57 deletions

File tree

src/vmaware.hpp

Lines changed: 30 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5285,7 +5285,7 @@ struct VM {
52855285
// const size_t frac_win = (N * 8 + 99) / 100; // ceil(N * 0.08)
52865286
// const size_t win = std::min(N, std::max(MIN_WIN, frac_win));
52875287
const size_t MIN_WIN = 10;
5288-
// Manual min/max calculation for win size
5288+
// manual min/max calculation for win size
52895289
const size_t calc_frac = static_cast<size_t>(std::ceil(static_cast<double>(N) * 0.08));
52905290
const size_t inner_max = (MIN_WIN > calc_frac) ? MIN_WIN : calc_frac;
52915291
const size_t win = (N < inner_max) ? N : inner_max;
@@ -5311,7 +5311,7 @@ struct VM {
53115311
if (static_cast<long double>(new_span) <= EXPAND_FACTOR * static_cast<long double>(best_span) ||
53125312
(s[cluster_hi - 1] <= (s[cluster_lo - 1] + static_cast<u64>(std::ceil(3.0L * sigma))))) {
53135313
--cluster_lo;
5314-
// Manual min calculation
5314+
// manual min calculation
53155315
best_span = (best_span < new_span) ? best_span : new_span;
53165316
}
53175317
else break;
@@ -5322,7 +5322,6 @@ struct VM {
53225322
if (static_cast<long double>(new_span) <= EXPAND_FACTOR * static_cast<long double>(best_span) ||
53235323
(s[cluster_hi] <= (s[cluster_lo] + static_cast<u64>(std::ceil(3.0L * sigma))))) {
53245324
++cluster_hi;
5325-
// Manual min calculation
53265325
best_span = (best_span < new_span) ? best_span : new_span;
53275326
}
53285327
else break;
@@ -5333,8 +5332,7 @@ struct VM {
53335332
// cluster must be reasonably dense and cover a non-negligible portion of samples, so this is pure sanity checks
53345333
const double fraction_in_cluster = static_cast<double>(cluster_size) / static_cast<double>(N);
53355334

5336-
// Manual min/max calculation for MIN_CLUSTER
5337-
// Original: std::min(static_cast<size_t>(std::max<int>(5, static_cast<int>(N / 50))), N);
5335+
// min/max calculation for MIN_CLUSTER
53385336
const int val_n_50 = static_cast<int>(N / 50);
53395337
const size_t val_max = static_cast<size_t>((5 > val_n_50) ? 5 : val_n_50);
53405338
const size_t MIN_CLUSTER = (val_max < N) ? val_max : N; // at least 2% or 5 elements
@@ -5477,10 +5475,12 @@ struct VM {
54775475
std::vector<u64> used;
54785476
for (u64 s : samples) if (s != 0) used.push_back(s);
54795477
const u64 cpuid_latency = calculate_latency(used);
5478+
const double cycles_per_iter = static_cast<double>(t1_delta) / static_cast<double>(ITER_XOR);
54805479

54815480
debug("TIMER: T1 delta: ", t1_delta);
54825481
debug("TIMER: T2 delta: ", t2_delta);
54835482
debug("TIMER: VMEXIT latency: ", cpuid_latency);
5483+
debug("TIMER: IPC: ", cycles_per_iter);
54845484

54855485
if (cpuid_latency >= cycle_threshold) {
54865486
debug("TIMER: Detected a vmexit on CPUID");
@@ -5493,74 +5493,47 @@ struct VM {
54935493
return true;
54945494
}
54955495

5496-
// ========================== LOCAL RATIO ==========================
5497-
54985496
// Within the same run, does Thread 2 see a smaller TSC delta than Thread 1?
54995497
// If so, a hypervisor downscaled TSC in the core where exits were occurring to hide vmexit latency
55005498
// while in the other core where no exits occurred, no TSC cycles were decreased, thus thread 2 ran faster than thread 1
55015499
// this logic can be bypassed if the hypervisor downscales TSC in both cores, and that's precisely why we do now a Global Ratio
55025500
const double local_ratio = double(t2_delta) / double(t1_delta);
55035501

55045502
if (local_ratio < 0.95 || local_ratio > 1.05) {
5505-
debug("TIMER: Detected a hypervisor intercepting TSC: ", local_ratio, "");
5503+
debug("TIMER: Detected a hypervisor intercepting TSC locally: ", local_ratio, "");
55065504
return true;
55075505
}
55085506

5509-
#if (WINDOWS)
5510-
typedef struct _PROCESSOR_POWER_INFORMATION {
5511-
u32 Number;
5512-
u32 MaxMhz;
5513-
u32 CurrentMhz;
5514-
u32 MhzLimit;
5515-
u32 MaxIdleState;
5516-
u32 CurrentIdleState;
5517-
} PROCESSOR_POWER_INFORMATION, * PPROCESSOR_POWER_INFORMATION;
5518-
5519-
enum POWER_INFORMATION_LEVEL_MIN {
5520-
ProcessorInformation = 11
5521-
};
5522-
5523-
const HMODULE hPowr = LoadLibraryA("powrprof.dll");
5524-
if (!hPowr) return 0;
5525-
5526-
const char* names[] = { "CallNtPowerInformation" };
5527-
void* funcs[1] = { nullptr };
5528-
util::get_function_address(hPowr, names, funcs, 1);
5529-
if (!funcs[0]) return 0;
5530-
5531-
using CallNtPowerInformation_t = NTSTATUS(__stdcall*)(int, PVOID, ULONG, PVOID, ULONG);
5532-
CallNtPowerInformation_t CallNtPowerInformation =
5533-
reinterpret_cast<CallNtPowerInformation_t>(funcs[0]);
5534-
5535-
SYSTEM_INFO si;
5536-
GetSystemInfo(&si);
5537-
const DWORD procCount = si.dwNumberOfProcessors;
5538-
if (procCount == 0) return 0;
5507+
// To calculate the global ratio, we calculate the TSC cycles consumed per iteration of the Thread 1 workload
5508+
// Thread 1 ran this dependency chain, x ^= i; x = (x << 1) ^ (x >> 3)
5509+
// because each instruction depends on the result of the previous one, the CPU cannot execute these in parallel
5510+
// on bare metal, a dependent ALU operation takes a minimum number of core cycles, which is typically 2-4 per iteration
5511+
// Even with an extreme 6.0GHz Turbo on a 2.0GHz Base Clock (3x ratio), this results in at least 0.7 to 1.0 TSC cycles per iteration
55395512

5540-
const SIZE_T bufSize = static_cast<SIZE_T>(procCount) * sizeof(PROCESSOR_POWER_INFORMATION);
5541-
void* raw = _malloca(bufSize);
5542-
if (!raw) return 0;
5543-
memset(raw, 0, bufSize);
5513+
// If a hypervisor is "shaving" cycles globally to hide the latency of Thread 2's CPUID spam,
5514+
// it subtracts time from the global counter. This causes Thread 1 (which caused no exits)
5515+
// to appear to have finished its work instantly or impossibly fast.
5516+
// If thread 1 reports finishing 100000000 dependent iterations in only 10000000 TSC cycles (so 0.1 cycles/iter), the CPU effectively ran at 10 instructions per cycle on a dependent chain
5517+
// which is basically impossible on x86 silicon and confirms the TSC was manipulated
55445518

5545-
const NTSTATUS status = CallNtPowerInformation(
5546-
ProcessorInformation,
5547-
nullptr, 0,
5548-
raw, static_cast<ULONG>(bufSize)
5549-
);
5550-
5551-
unsigned speed = 0;
5552-
if ((LONG)status >= 0) {
5553-
PROCESSOR_POWER_INFORMATION* info = reinterpret_cast<PROCESSOR_POWER_INFORMATION*>(raw);
5554-
speed = static_cast<unsigned>(info[0].CurrentMhz);
5519+
// 0.25 is an extremely conservative threshold but there's no need to raise it
5520+
// because mathematically speaking, considering the number of times cpuid is called in thread 2, the shaved cycles will exceed this limit even with a patch that downscales only 100 cycles per cpuid call
5521+
// a value this low implies the code ran 4x faster than the theoretical limit of the silicon,
5522+
// or roughly 12x faster than the actual core clock speed
5523+
5524+
// This is immune to turbo/throttling noise because those variances (10-30%) are negligible
5525+
// compared to the massive reduction (like 90%+) caused by exit hiding
5526+
if (cycles_per_iter < 0.25) {
5527+
debug("TIMER: Detected a hypervisor downscaling TSC globally (IPC was impossible): ", cycles_per_iter);
5528+
return true;
55555529
}
55565530

5557-
_freea(raw);
5558-
5559-
if (speed < 800) {
5560-
debug("TIMER: Detected a hook in rdtsc, frequency was: ", speed);
5531+
// so if the patch subtracted too many TSC cycles, the shaving might result in a near-zero or negative delta
5532+
// (handled by u64 overflow usually making it huge, but good shaves clamp to 0 or 1)
5533+
if (t1_delta < 1000) {
5534+
debug("TIMER: Detected a hypervisor downscaling TSC globally (time was stopped): ", t1_delta);
55615535
return true;
55625536
}
5563-
#endif
55645537
#endif
55655538
return false;
55665539
}

0 commit comments

Comments
 (0)