From a08f6093389c2cb751202f241a7cc3e1ffc75d31 Mon Sep 17 00:00:00 2001 From: Quang Trinh Date: Fri, 24 Apr 2026 04:05:56 +0700 Subject: [PATCH 1/3] perf(event_dispatcher): lock-free emit via atomic shared_ptr snapshot Replace shared_mutex reader lock with std::atomic> copy-on-write snapshot for emit()/emit_safe(), plus an atomic handler counter fast path for the zero-subscriber case. Add DetourModKit_bench microbench (DMK_BUILD_BENCHMARKS) and three new tests covering the fast path, snapshot stability, and reclamation. Include loader-lock destructor hardening in HookManager to keep move-only VmtHookEntry values out of container copy fallbacks. CI: add MSVC verify job to coverage-pages.yml, gate pages deploy on both MinGW and MSVC passing. --- .github/workflows/coverage-pages.yml | 45 +++- AGENTS.md | 6 +- CMakeLists.txt | 17 +- README.md | 6 +- .../event_dispatcher_bench_v3.2.0/README.md | 106 ++++++++ .../event_dispatcher_bench_v3.2.0/after.tsv | 13 + .../event_dispatcher_bench_v3.2.0/before.tsv | 13 + docs/tests/README.md | 37 ++- include/DetourModKit/event_dispatcher.hpp | 201 ++++++++++---- src/hook_manager.cpp | 30 ++- tests/CMakeLists.txt | 207 ++++++++------- tests/bench_event_dispatcher.cpp | 251 ++++++++++++++++++ tests/test_event_dispatcher.cpp | 131 +++++++++ tests/test_hook_manager.cpp | 26 ++ 14 files changed, 925 insertions(+), 164 deletions(-) create mode 100644 docs/analysis/event_dispatcher_bench_v3.2.0/README.md create mode 100644 docs/analysis/event_dispatcher_bench_v3.2.0/after.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.2.0/before.tsv create mode 100644 tests/bench_event_dispatcher.cpp diff --git a/.github/workflows/coverage-pages.yml b/.github/workflows/coverage-pages.yml index a25f815..6bc6712 100644 --- a/.github/workflows/coverage-pages.yml +++ b/.github/workflows/coverage-pages.yml @@ -27,14 +27,10 @@ env: MINGW_BIN: C:\mingw64\bin jobs: - build-and-deploy: - name: Build, Test & Deploy Coverage + mingw-coverage: + name: MinGW Build, Test & Coverage Artifact runs-on: windows-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - steps: - name: Checkout code uses: actions/checkout@v4 @@ -102,6 +98,43 @@ jobs: with: path: coverage-report + msvc-verify: + name: MSVC Build & Test + runs-on: windows-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: "recursive" + + - name: Set up MSVC developer environment + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + + - name: Configure (MSVC Debug + Tests) + run: cmake --preset msvc-debug + shell: cmd + + - name: Build + run: cmake --build --preset msvc-debug --parallel + shell: cmd + + - name: Run Tests + run: ctest --preset msvc-debug + shell: cmd + + deploy-pages: + name: Deploy Coverage Report + runs-on: ubuntu-latest + needs: [mingw-coverage, msvc-verify] + + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v4 diff --git a/AGENTS.md b/AGENTS.md index cd78fae..acbd0ac 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -167,7 +167,7 @@ pending_messages_.fetch_add(1, std::memory_order_acq_rel); ### Resource management and patterns -- **RAII everywhere:** `std::unique_ptr`, `std::shared_ptr`, `std::lock_guard`, `std::scoped_lock`. No naked `new`/`delete` in application code. The only permitted exception is leak-on-purpose singletons to avoid the static destruction order fiasco (must be documented with a comment explaining why). +- **RAII everywhere:** `std::unique_ptr`, `std::shared_ptr`, `std::lock_guard`, `std::scoped_lock`. No naked `new`/`delete` in application code. The only permitted exception is leak-on-purpose state to avoid teardown hazards -- specifically the static destruction order fiasco or deadlock when destruction would run under the Windows loader lock. Any such leak must be documented with a comment explaining why, must use `new (std::nothrow)` so the enclosing `noexcept` path stays honest, and must pin the current module so code pages referenced by the leaked state stay mapped (see `HookManager::~HookManager` and `Logger::shutdown_internal`). - **Rule of Zero/Five:** Prefer Rule of Zero (let compiler generate special members). When custom resource management is needed, implement all five special members. Delete copy/move when the type is non-copyable/non-movable. - **Atomic memory orderings:** Use the weakest correct ordering. `memory_order_relaxed` for counters and non-critical flags. `acquire`/`release` pairs for synchronization. Document why in comments only when the ordering is non-obvious. - **Lock ordering:** When acquiring multiple locks, document the order in the class header and follow it strictly. Example from `logger.hpp`: `1. async_mutex_` then `2. *log_mutex_ptr_`. @@ -253,14 +253,14 @@ PATH="/c/msys64/mingw64/bin:$PATH" ./build/mingw-debug/tests/DetourModKit_tests. | Module | Thread safety | Hot-path mechanism | |--------|--------------|-------------------| | Scanner | Stateless -- inherently safe | N/A (startup only) | -| HookManager | `shared_mutex` (readers) / `unique_lock` (writers); two-phase shutdown (disable under shared lock, clear under exclusive lock); `m_mutator_gate` (shared_mutex) blocks new mutators (including all VMT operations) during teardown; CAS on `m_shutdown_called` serializes shutdown/remove_all_hooks; double-checked fast-fail on `m_shutdown_called` in all mutators; destructor fallback (when `DMK_Shutdown()` was not called) acquires `m_mutator_gate` exclusively, flips `m_shutdown_called`, drains readers via exclusive `m_hooks_mutex`, then clears the maps -- under loader lock it pins the module and leaks the maps into a static vector instead of draining (mirrors the Logger::shutdown_internal pattern) | `shared_lock` for `with_inline_hook()` | +| HookManager | `shared_mutex` (readers) / `unique_lock` (writers); two-phase shutdown (disable under shared lock, clear under exclusive lock); `m_mutator_gate` (shared_mutex) blocks new mutators (including all VMT operations) during teardown; CAS on `m_shutdown_called` serializes shutdown/remove_all_hooks; double-checked fast-fail on `m_shutdown_called` in all mutators; destructor fallback (when `DMK_Shutdown()` was not called) acquires `m_mutator_gate` exclusively, flips `m_shutdown_called`, drains readers via exclusive `m_hooks_mutex`, then clears the maps -- under loader lock it pins the module and move-constructs each map onto the heap via `new (std::nothrow)` so the storage outlives the destructor without ever draining, mirroring the leak-on-loader-lock discipline used in `Logger::shutdown_internal` | `shared_lock` for `with_inline_hook()` | | Logger | `atomic` for lock-free async reads; `shutdown_internal` is safe across repeated shutdown / enable_async_mode cycles: when the writer thread has to be detached under loader lock, the `shared_ptr` is appended to a static `std::vector` rather than overwriting a single static slot, so prior handles are never dropped while their writer threads may still be running | Single atomic load on log level check | | AsyncLogger | Lock-free MPMC queue (Vyukov-style); post-join drain on shutdown (at most one message per producer can be lost in the nanosecond race between drain and force-zero -- accepted trade-off to avoid atomic overhead on every enqueue); timestamp caching in write batches | Atomic sequence numbers per slot | | InputPoller | Atomic `active_states_[]` array | `memory_order_relaxed` load per binding | | InputManager | `mutex` for lifecycle, `atomic` for reads | Lock-free `is_binding_active()` | | Memory cache | Sharded `SRWLOCK` + epoch-based shutdown | Shared reader locks per shard | | Config | `mutex` for registration; deferred setter invocation outside lock (no reentrancy guard needed -- setters may call back into Config) | N/A (startup only) | -| EventDispatcher | `shared_mutex` -- shared lock for `emit()`/`emit_safe()`, exclusive lock for subscribe/unsubscribe; thread-local reentrancy guard rejects subscribe/unsubscribe from within handlers; `emit()` propagates handler exceptions, `emit_safe()` catches and skips them | `shared_lock` + contiguous vector iteration in subscription order | +| EventDispatcher | Lock-free `emit()` / `emit_safe()` via `std::atomic>>` snapshot (copy-on-write publish, acquire-load on read); zero-subscriber fast path skips the snapshot load via an atomic handler counter; writers serialize on a small `std::mutex` that never touches the emit hot path; thread-local reentrancy guard rejects subscribe/unsubscribe from within handlers so the no-mutation-during-emit invariant holds; `emit()` propagates handler exceptions, `emit_safe()` catches and skips them | Atomic acquire-load of a `shared_ptr` snapshot plus linear iteration over a contiguous vector; no reader lock | | Profiler | Lock-free ring buffer via atomic `fetch_add` on write position; odd/even sequence counter per sample slot prevents torn reads during concurrent export -- the sequence is opened and closed with unconditional `fetch_add` (never a load-then-store) so concurrent producers racing on the same slot cannot roll the counter backwards; `DMK_PROFILE_SCOPE(name)` requires `name` to be a string literal, enforced at compile time by a `ScopedProfile` constructor that only binds to `const char (&)[N]` | Single atomic increment + sequence-guarded field writes per sample | ### Performance-critical paths diff --git a/CMakeLists.txt b/CMakeLists.txt index 854c83e..5b8a214 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -467,8 +467,19 @@ endif() # --- Unit Tests --- option(DMK_BUILD_TESTS "Build unit tests" OFF) -if(DMK_BUILD_TESTS) - message(STATUS "Building unit tests...") - enable_testing() +# --- Benchmarks --- +# Opt-in microbenchmark executables (e.g. DetourModKit_bench). Gated so that +# normal consumer builds do not produce extra targets. The bench sources live +# alongside tests/ and are wired up in tests/CMakeLists.txt. +option(DMK_BUILD_BENCHMARKS "Build benchmark executables" OFF) + +if(DMK_BUILD_TESTS OR DMK_BUILD_BENCHMARKS) + if(DMK_BUILD_TESTS) + message(STATUS "Building unit tests...") + enable_testing() + endif() + if(DMK_BUILD_BENCHMARKS) + message(STATUS "Building benchmarks...") + endif() add_subdirectory(tests) endif() diff --git a/README.md b/README.md index 9c322f5..3f18fea 100644 --- a/README.md +++ b/README.md @@ -105,13 +105,15 @@ DetourModKit is a lightweight C++ toolkit designed to simplify common tasks in g - Typed pub/sub event system with RAII subscription management - Each `EventDispatcher` manages a single event type -- `shared_mutex` concurrency: concurrent `emit()` via shared lock, exclusive lock for subscribe/unsubscribe +- Lock-free `emit()` / `emit_safe()`: atomic acquire-load of a `std::shared_ptr` snapshot and iterate, no reader lock; `subscribe()` / `unsubscribe()` are copy-on-write under a small writer mutex +- Zero-subscriber fast path: `emit()` / `emit_safe()` short-circuit on a lock-free atomic handler counter, skipping the snapshot load entirely - Subscriptions auto-unsubscribe on destruction - Handlers invoked in subscription order (preserved across unsubscribe) -- Thread-local reentrancy guard detects and rejects subscribe/unsubscribe calls from within a handler, preventing deadlock +- Thread-local reentrancy guard detects and rejects subscribe/unsubscribe calls from within a handler, keeping the no-mutation-during-emit invariant intact - Compose multiple dispatchers for multi-event architectures - `emit_safe()` for exception-tolerant dispatch (recommended for hook callbacks) - Safe when the dispatcher is destroyed before its subscriptions (weak_ptr guard) +- Trade-off: `subscribe()` / `unsubscribe()` allocate a new handler list each call (O(n) publish). Suited for 1-10 subscribers per event and write-rarely access patterns, which matches typical mod usage diff --git a/docs/analysis/event_dispatcher_bench_v3.2.0/README.md b/docs/analysis/event_dispatcher_bench_v3.2.0/README.md new file mode 100644 index 0000000..c54f18e --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.2.0/README.md @@ -0,0 +1,106 @@ +# EventDispatcher Bench, v3.2.0 + +Before/after numbers for the lock-free COW snapshot `emit()` landed in v3.2.0. +The previous implementation used `std::shared_mutex` for `emit()` / `emit_safe()` +and an exclusive lock for `subscribe()` / `unsubscribe()`. The new implementation +stores handlers in a `std::atomic>>` +snapshot published on mutation, with a lock-free atomic handler-count fast +path for the zero-subscriber case. + +## Results + +| Scenario | Subs | Before (ns/op) | After (ns/op) | Delta | +| --------------------------- | ---: | -------------: | ------------: | ----------------- | +| `emit` | 0 | 105.20 | **6.47** | **-94% (16.3x)** | +| `emit` | 1 | 126.23 | 106.85 | -15% | +| `emit` | 8 | 253.99 | 249.52 | -2% | +| `emit` | 64 | 1121.43 | 1324.66 | +18% (regression) | +| `emit_safe` | 0 | 103.55 | **6.32** | **-94% (16.4x)** | +| `emit_safe` | 1 | 119.27 | 106.76 | -10% | +| `emit_safe` | 8 | 231.13 | 208.92 | -10% | +| `emit_safe` | 64 | 1169.86 | 1077.59 | -8% | +| `subscribe_unsub_roundtrip` | 0 | 487.18 | 1125.23 | +131% (expected) | +| `emit_concurrent_4_threads` | 8 | 551.73 | **268.07** | **-51% (2.06x)** | +| `reentrancy_rejection` | 1 | 239.07 | 202.82 | -15% | + +Raw TSVs in [before.tsv](before.tsv) and [after.tsv](after.tsv). Each row is the +median of 11 samples. Iteration counts vary per row (10M for fast cases down to +200K for the slowest) to keep per-scenario wall time comparable. + +## Interpretation + +**Zero-subscriber fast path.** The atomic handler-count short-circuit in +`emit()` / `emit_safe()` collapses a `shared_mutex` acquire/release plus +iteration setup into a single `memory_order_acquire` load of an 8-byte counter. +The 16x factor is the cost of an uncontended `shared_mutex` acquire/release +on Windows SRWLOCK relative to a naked atomic load, and it is the dominant +result for dispatchers that are wired up at init but rarely subscribed to. + +**1 to 8 subscriber uncontended emit.** Small consistent wins (10% to 15%) +from removing the reader lock. The snapshot load is a release-acquire atomic +plus a `shared_ptr` refcount bump, which is cheaper than touching a mutex's +state word unconditionally. + +**Concurrent emit (4 threads, 8 subs).** 2.06x throughput. No reader lock +means no cache-line contention on the mutex state, so all four threads make +progress in parallel instead of serializing on the SRWLOCK read side. + +**64 subscriber emit, single thread.** 18% slower (+203 ns on a 1121 ns +baseline). Two plausible causes: + +1. Timer noise. On an 1100 ns run, 200 ns is 2-3 cycles worth of timer jitter + amplified across the sample; the noise floor on `steady_clock` is + typically in the tens of nanoseconds per sample. +2. `std::atomic` load cost dominates over the old loop's + single mutex acquire when amortized over only 64 handlers. libstdc++'s + implementation uses DWCAS (cmpxchg16b) on the snapshot atomic; MSVC + uses an internal spinlock. + +Typical DetourModKit usage (per the README: 1-10 subscribers per event, +dispatchers wired once at init) stays well inside the range where the +optimization is a pure win. The 64 subscriber row should be treated as a +worst-case indicator, not representative load. + +**Subscribe / unsubscribe round-trip.** 2.31x slower (487 ns to 1125 ns). +Each mutation allocates a fresh handler vector, appends or removes the +entry, and publishes via atomic store. This is documented in the header +and is the accepted tradeoff for lock-free reads. Subscribe is not on a +hot path in any realistic mod workload. + +**Concurrent emit, reentrancy rejection.** Small wins from the same +fast-path removal of the shared lock. + +## Methodology + +- Host: Windows 11, MinGW `mingw-release` preset (GCC 13, libstdc++, -O3 LTO). +- CMake: `cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF`. +- Build: `DetourModKit_bench` target only. No gtest linkage, no other test deps. +- Each sample runs N iterations of the scenario inside a single + `steady_clock::now()` pair. Reported value is the median per-op cost across + 11 samples. Iteration counts are chosen so each sample takes roughly the + same wall time. +- Back-to-back runs, same machine, same process start, thermal state + comparable. Numbers are not hermetic; reruns on the same machine drift by + a few percent at this granularity. + +## Reproduce + +```bash +cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF +PATH="/c/msys64/mingw64/bin:$PATH" cmake --build build/mingw-release --target DetourModKit_bench --parallel +PATH="/c/msys64/mingw64/bin:$PATH" ./build/mingw-release/tests/DetourModKit_bench.exe > after.tsv +``` + +For a clean before/after comparison, bench the new implementation first, +copy the header aside, `git checkout HEAD -- include/DetourModKit/event_dispatcher.hpp` +to restore the baseline header, rebuild the `DetourModKit_bench` target, run +again into `before.tsv`, then restore the new header. + +## Caveat on committed TSVs + +The `before.tsv` and `after.tsv` files in this directory are raw artifacts +from one run on one machine. They are not a stable baseline. Treat them as +evidence for the claims in this document, not as a regression gate. Future +bench runs should regenerate their own numbers and compare against the +structure of the results (16x fast-path win, 2x concurrent win, COW +subscribe cost) rather than the absolute nanosecond values. diff --git a/docs/analysis/event_dispatcher_bench_v3.2.0/after.tsv b/docs/analysis/event_dispatcher_bench_v3.2.0/after.tsv new file mode 100644 index 0000000..995d1d2 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.2.0/after.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 6.47 718 +emit 1 5000000 106.85 5860 +emit 8 1000000 249.52 2756 +emit 64 200000 1324.66 2940 +emit_safe 0 10000000 6.32 696 +emit_safe 1 5000000 106.76 5727 +emit_safe 8 1000000 208.92 2341 +emit_safe 64 200000 1077.59 2401 +subscribe_unsub_roundtrip 0 100000 1125.23 1259 +emit_concurrent_4_threads 8 4000000 268.07 1072 +reentrancy_rejection 1 500000 202.82 1148 +# sink=23940842247 diff --git a/docs/analysis/event_dispatcher_bench_v3.2.0/before.tsv b/docs/analysis/event_dispatcher_bench_v3.2.0/before.tsv new file mode 100644 index 0000000..59b6007 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.2.0/before.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 105.20 11778 +emit 1 5000000 126.23 6977 +emit 8 1000000 253.99 3539 +emit 64 200000 1121.43 2538 +emit_safe 0 10000000 103.55 11537 +emit_safe 1 5000000 119.27 6594 +emit_safe 8 1000000 231.13 2555 +emit_safe 64 200000 1169.86 2613 +subscribe_unsub_roundtrip 0 100000 487.18 545 +emit_concurrent_4_threads 8 4000000 551.73 2206 +reentrancy_rejection 1 500000 239.07 1326 +# sink=24038801584 diff --git a/docs/tests/README.md b/docs/tests/README.md index 056601d..8b65c54 100644 --- a/docs/tests/README.md +++ b/docs/tests/README.md @@ -268,17 +268,50 @@ This is expected when casting `FARPROC` from `GetProcAddress` to a typed functio | 90% | Hard | Integration tests, edge cases in threading | | 95%+ | Very Hard | Requires mocking Windows API or refactoring | +## Event Dispatcher Tests + +`tests/test_event_dispatcher.cpp` exercises the lock-free copy-on-write dispatcher. Beyond the basic subscribe/emit/RAII coverage, three tests target the optimized hot path specifically: + +| Test | What it proves | +| ---- | -------------- | +| `EmptyFastPath_SkipsLock` | With zero subscribers, `emit()` / `emit_safe()` return via the atomic handler-count check without touching the snapshot `shared_ptr`. Asserted by `debug_snapshot_use_count()` remaining at 1 (dispatcher-only) after 1000 emits. | +| `SnapshotStability_DuringEmit` | An in-flight emit holds its own snapshot reference. A concurrent `subscribe()` publishes a new snapshot via CAS; the emitter's iteration continues over the old snapshot and the newly subscribed handler is not invoked. Next emit sees both. Exercises the COW publish invariant. | +| `SnapshotReclamation_NoLeak` | After 10,000 subscribe/unsubscribe churn iterations with interleaved emits, `debug_snapshot_use_count()` returns to 1, proving no leaked `shared_ptr` references to stale snapshots. | + +These tests enable the test-only `debug_snapshot_use_count()` accessor via `#define DMK_EVENT_DISPATCHER_INTERNAL_TESTING 1` at the top of the translation unit. The macro is not part of the public API and must not be defined in consumer code. + +## Benchmark Harness + +`tests/bench_event_dispatcher.cpp` is a standalone microbenchmark executable. It is deliberately not a gtest binary so it can run under any build configuration (release, release+PGO, ASAN, etc.) without dragging in the gtest runtime. + +Build it by adding `-DDMK_BUILD_BENCHMARKS=ON` to the configure step: + +```bash +PATH="/c/msys64/mingw64/bin:$PATH" +cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON +cmake --build build/mingw-release --parallel +./build/mingw-release/tests/DetourModKit_bench.exe > bench.tsv +``` + +The option is independent of `DMK_BUILD_TESTS`, so you can build the bench alone. Output is a tab-separated table on stdout with columns `scenario, subscribers, iterations, median_ns_per_op, total_ms`. Covered scenarios: + +- `emit` / `emit_safe` at 0, 1, 8, 64 subscribers (the 0-subscriber rows measure the fast path). +- `subscribe_unsub_roundtrip` (single-thread RAII churn). +- `emit_concurrent_4_threads` (contention stress on the lock-free read path). +- `reentrancy_rejection` (cost of the guard's reject-during-handler path). + ## Project Structure ```text tests/ -├── CMakeLists.txt # Test discovery, fixture DLL build +├── CMakeLists.txt # Test discovery, fixture DLL build, bench wiring ├── main.cpp # GoogleTest entry point +├── bench_event_dispatcher.cpp # Standalone microbench (DMK_BUILD_BENCHMARKS) ├── fixtures/ │ └── hook_target_lib.cpp # Fixture DLL (exported functions for integration tests) ├── test_async_logger.cpp # Async logger tests ├── test_config.cpp # Configuration tests -├── test_event_dispatcher.cpp # Event dispatcher tests +├── test_event_dispatcher.cpp # Event dispatcher tests (incl. fast-path and snapshot stability) ├── test_filesystem.cpp # Filesystem tests ├── test_format.cpp # Format utilities tests ├── test_hook_integration.cpp # Cross-module hook integration tests diff --git a/include/DetourModKit/event_dispatcher.hpp b/include/DetourModKit/event_dispatcher.hpp index 4b83d0a..14ea33b 100644 --- a/include/DetourModKit/event_dispatcher.hpp +++ b/include/DetourModKit/event_dispatcher.hpp @@ -10,16 +10,27 @@ * automatically unsubscribe on destruction. * * **Threading model:** - * - `emit()` acquires a shared lock (readers do not block each other) - * - `subscribe()` / manual `unsubscribe()` acquire an exclusive lock - * - Safe to emit from multiple threads concurrently (e.g., hook callbacks) - * - Safe to subscribe/unsubscribe from any thread + * - `emit()` / `emit_safe()` are lock-free on the hot path: they + * acquire-load a `std::shared_ptr>` + * snapshot and iterate it. Concurrent emits never contend. + * - `subscribe()` / manual `unsubscribe()` serialize writers through + * a small `std::mutex` and publish a new immutable snapshot via + * copy-on-write. + * - Safe to emit from multiple threads concurrently (e.g., hook callbacks). + * - Safe to subscribe/unsubscribe from any thread. * * **Performance characteristics:** - * - `emit()`: shared_lock + linear iteration over contiguous handler vector - * - `subscribe()`: exclusive_lock + vector push_back (amortized O(1)) - * - `unsubscribe()`: exclusive_lock + erase-remove (O(n) subscribers) - * - No heap allocation on `emit()`. Handler vector is cache-friendly. + * - `emit()`: atomic acquire-load of a `shared_ptr` snapshot, then + * linear iteration over the contiguous handler vector. No mutex + * acquisition on the hot path. When there are no subscribers, + * `emit()` skips the snapshot load entirely via an atomic counter. + * - `subscribe()` / `unsubscribe()`: copy-on-write. Each writer + * allocates a new handler vector (O(n) in the current subscriber + * count), appends or removes an entry, and publishes it atomically. + * Typical dispatcher usage is 1-10 subscribers and write-rarely, + * so the O(n) publish cost is negligible in practice. + * - No heap allocation on `emit()` beyond the `shared_ptr` refcount + * bump. Handler vector is cache-friendly. * * **Usage:** * @code @@ -32,7 +43,7 @@ * logger.info("Health: {}", e.health); * }); * - * // Emit from a hook callback (shared_lock, thread-safe) + * // Emit from a hook callback (lock-free, thread-safe) * dispatcher.emit(PlayerStateChanged{.health = 75.0f}); * @endcode */ @@ -43,7 +54,6 @@ #include #include #include -#include #include #include @@ -107,8 +117,8 @@ namespace DetourModKit * If the Subscription is also destroyed inside the same * handler scope, the destructor's reset() is likewise * skipped because emitting_depth is still positive. - * This prevents deadlock from acquiring an exclusive lock - * inside a shared lock held by emit(). + * This keeps the no-mutation-during-emit invariant intact + * so the in-flight snapshot iteration remains consistent. */ void reset() noexcept { @@ -159,18 +169,30 @@ namespace DetourModKit * @endcode * * **Thread safety:** - * - `emit()`: shared_lock (multiple concurrent emitters OK) - * - `subscribe()` / `unsubscribe()`: exclusive_lock - * - Handlers are invoked under shared_lock. A thread-local reentrancy - * guard detects and rejects subscribe/unsubscribe calls from within - * a handler, returning a default-constructed Subscription instead of - * deadlocking. + * - `emit()` / `emit_safe()`: lock-free. Acquires a `shared_ptr` snapshot + * of the immutable handler list and iterates it. + * - `subscribe()` / `unsubscribe()`: copy-on-write under a small writer + * mutex. Each mutation allocates a new handler vector, appends or + * removes the entry, and publishes the new snapshot atomically. + * - Handlers are invoked while the snapshot's `shared_ptr` keeps the + * vector alive. A thread-local reentrancy guard detects and rejects + * subscribe/unsubscribe calls from within a handler; the guard is what + * guarantees the user's "do not mutate during emit" invariant, not the + * snapshot mechanism. * * **Reentrancy guard scope:** The guard is per-template-instantiation, * not per-instance. Two dispatchers of the same Event type share the * same thread-local counter. Subscribing to a second dispatcher of * the same type from within a handler on the first will be rejected. * Use distinct event types to avoid this (the typical usage pattern). + * + * **Subscribe/emit ordering invariant:** A subscribe() performs a + * release-store on both the snapshot pointer and the atomic handler + * count. Any thread that observes the Subscription object returned + * from subscribe() (or synchronizes-with the thread that did) will + * see the subscription in subsequent emits. Without such a + * happens-before edge, a concurrent emit may or may not observe a + * freshly-published handler -- this matches the user's own ordering. */ template class EventDispatcher @@ -179,8 +201,22 @@ namespace DetourModKit /// Handler function signature: receives the event by const reference. using Handler = std::function; + private: + // Private type aliases surfaced here so they are visible to the + // public API's member declarations and constructor below. + struct Entry + { + SubscriptionId id; + Handler callback; + }; + + using HandlerList = std::vector; + using SharedList = std::shared_ptr; + + public: EventDispatcher() - : alive_(std::make_shared('\0')) + : handlers_(std::make_shared()), + alive_(std::make_shared('\0')) { } @@ -197,7 +233,9 @@ namespace DetourModKit * from any thread. * @return RAII Subscription guard. The handler is removed when the guard * is destroyed or reset(). - * @note Acquires exclusive lock. Do not call from within a handler. + * @note Copy-on-write: allocates a new handler list of size N+1. + * Acceptable for the expected mutation rate (startup and + * occasional reconfiguration). Do not call from within a handler. */ [[nodiscard]] Subscription subscribe(Handler handler) { @@ -210,8 +248,16 @@ namespace DetourModKit this->next_id_.fetch_add(1, std::memory_order_relaxed)); { - auto guard = std::unique_lock{this->mutex_}; - this->handlers_.push_back(Entry{id, std::move(handler)}); + std::scoped_lock lock{this->writer_mutex_}; + auto current = this->handlers_.load(std::memory_order_acquire); + auto next = std::make_shared(*current); + next->push_back(Entry{id, std::move(handler)}); + // Publish the new count first so a reader that sees 0 on the + // counter and skips the snapshot load cannot miss a handler + // that has already been installed in the snapshot. + this->handler_count_.store(next->size(), std::memory_order_release); + this->handlers_.store(std::shared_ptr(std::move(next)), + std::memory_order_release); } std::weak_ptr weak = this->alive_; @@ -223,9 +269,11 @@ namespace DetourModKit /** * @brief Emits an event to all subscribers. * @param event The event payload, passed by const reference to each handler. - * @note Acquires shared lock. Multiple threads may emit concurrently. - * Handlers are invoked synchronously in subscription order. - * Exceptions thrown by handlers propagate to the caller. + * @note Lock-free: performs one atomic acquire-load of the snapshot + * pointer and iterates. Multiple threads may emit concurrently + * without contention. Handlers are invoked synchronously in + * subscription order. Exceptions thrown by handlers propagate + * to the caller. * @warning If calling from a game hook callback or any context where an * unhandled exception would crash the host process, use * emit_safe() instead. emit() lets handler exceptions propagate @@ -234,9 +282,15 @@ namespace DetourModKit */ void emit(const Event &event) const { - auto lock = std::shared_lock{this->mutex_}; + // Fast path: no subscribers means no snapshot load at all. + if (this->handler_count_.load(std::memory_order_acquire) == 0) + { + return; + } + + SharedList snap = this->handlers_.load(std::memory_order_acquire); EmitGuard guard{emitting_depth()}; - for (const auto &entry : this->handlers_) + for (const auto &entry : *snap) { entry.callback(event); } @@ -245,17 +299,24 @@ namespace DetourModKit /** * @brief Emits an event, catching and discarding handler exceptions. * @param event The event payload. - * @note Same locking semantics as emit(). Handlers that throw are - * skipped; remaining handlers still execute. + * @note Same locking semantics as emit() (lock-free). Handlers that + * throw are skipped; remaining handlers still execute. * Prefer this over emit() when calling from hook callbacks or * other contexts where an unhandled exception would crash the * host process. */ void emit_safe(const Event &event) const noexcept { - auto lock = std::shared_lock{this->mutex_}; + if (this->handler_count_.load(std::memory_order_acquire) == 0) + { + return; + } + + // std::shared_ptr copy-construction and load are noexcept, so the + // entire function remains noexcept despite the per-handler catch. + SharedList snap = this->handlers_.load(std::memory_order_acquire); EmitGuard guard{emitting_depth()}; - for (const auto &entry : this->handlers_) + for (const auto &entry : *snap) { try { @@ -270,34 +331,54 @@ namespace DetourModKit /// Returns the number of active subscribers. [[nodiscard]] size_t subscriber_count() const noexcept { - auto guard = std::shared_lock{this->mutex_}; - return this->handlers_.size(); + return this->handler_count_.load(std::memory_order_acquire); } /// Returns true if there are no subscribers. [[nodiscard]] bool empty() const noexcept { - auto guard = std::shared_lock{this->mutex_}; - return this->handlers_.empty(); + return this->handler_count_.load(std::memory_order_acquire) == 0; } /** * @brief Removes all subscribers. - * @note Acquires exclusive lock. + * @note Serializes with other writers via the writer mutex; readers + * in flight keep their snapshot alive through their shared_ptr. */ void clear() noexcept { - auto guard = std::unique_lock{this->mutex_}; - this->handlers_.clear(); + std::scoped_lock lock{this->writer_mutex_}; + // Counter must go to 0 before publishing the empty snapshot so + // an emit that reads 0 on the fast-path counter cannot still see + // the non-empty old snapshot afterwards. + this->handler_count_.store(0, std::memory_order_release); + this->handlers_.store(std::make_shared(), + std::memory_order_release); } - private: - struct Entry +#if defined(DMK_EVENT_DISPATCHER_INTERNAL_TESTING) + /** + * @brief Test-only diagnostic: returns the number of outstanding + * references to the current handler snapshot, excluding the + * temporary this call itself creates. A value of 1 means the + * dispatcher's own atomic is the sole holder (steady state). + * A value >1 indicates an in-flight emit or a leaked snapshot + * reference. Enabled only when + * DMK_EVENT_DISPATCHER_INTERNAL_TESTING is defined by the + * test translation unit. Not part of the public API. + */ + [[nodiscard]] long debug_snapshot_use_count() const noexcept { - SubscriptionId id; - Handler callback; - }; + // load() returns a shared_ptr copy that bumps the refcount by 1 + // for its own lifetime; subtract that so the reported count + // reflects only the other holders (the dispatcher atomic and + // any in-flight emit snapshots). + auto snap = this->handlers_.load(std::memory_order_acquire); + return snap.use_count() - 1; + } +#endif + private: // Returns false when called from within a handler (reentrancy). // The Subscription::reset() caller retains its unsubscribe_ lambda // and will retry on the next reset() call (including the destructor) @@ -310,14 +391,32 @@ namespace DetourModKit return false; } - auto guard = std::unique_lock{this->mutex_}; - auto it = std::find_if(this->handlers_.begin(), this->handlers_.end(), + std::scoped_lock lock{this->writer_mutex_}; + auto current = this->handlers_.load(std::memory_order_acquire); + auto it = std::find_if(current->begin(), current->end(), [id](const Entry &e) { return e.id == id; }); - if (it != this->handlers_.end()) + if (it == current->end()) { - this->handlers_.erase(it); + // Not found; treat as successful (idempotent unsubscribe). return true; } + + auto next = std::make_shared(); + next->reserve(current->size() - 1); + for (const auto &entry : *current) + { + if (entry.id != id) + { + next->push_back(entry); + } + } + + // Publish snapshot first, then the counter. An emit that loads a + // stale snapshot containing the removed handler is still safe + // because the handler callable is retained by the old snapshot. + this->handlers_.store(std::shared_ptr(std::move(next)), + std::memory_order_release); + this->handler_count_.store(current->size() - 1, std::memory_order_release); return true; } @@ -345,9 +444,13 @@ namespace DetourModKit EmitGuard &operator=(EmitGuard &&) = delete; }; - mutable std::shared_mutex mutex_; - std::vector handlers_; + // alignas(64) keeps the hot atomics on their own cache line so the + // writer mutex and shared_ptr control-block traffic do not produce + // false sharing with readers doing the fast-path counter load. + alignas(64) mutable std::atomic handlers_; + std::atomic handler_count_{0}; std::atomic next_id_{1}; + std::mutex writer_mutex_; // serializes writers only std::shared_ptr alive_; // Prevents Subscription::reset() from calling // unsubscribe() after dispatcher destruction. }; diff --git a/src/hook_manager.cpp b/src/hook_manager.cpp index c58f775..8bda710 100644 --- a/src/hook_manager.cpp +++ b/src/hook_manager.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -151,10 +152,31 @@ HookManager::~HookManager() noexcept if (detail::is_loader_lock_held()) { detail::pin_current_module(); - static std::vector, detail::TransparentStringHash, std::equal_to<>>> s_leaked_hooks; - static std::vector>> s_leaked_vmt_hooks; - s_leaked_hooks.emplace_back(std::move(m_hooks)); - s_leaked_vmt_hooks.emplace_back(std::move(m_vmt_hooks)); + // Intentional leak under loader lock: draining readers or destroying + // Hook / VmtHookEntry values here can deadlock against another thread + // waiting on a loader callback. The pinned module keeps the SafetyHook + // trampoline pages live for the remainder of the process, so the leaked + // maps and their contents stay valid storage even though no one will + // ever observe them again. HookManager is a singleton so this branch + // runs at most once per process. + // + // Heap-allocate each map directly rather than nesting them inside an + // outer container. A container of containers would force the standard + // library to instantiate a copy-construction fallback for the element + // type whenever the element's move constructor is not unconditionally + // noexcept, and that copy path would try to copy a move-only member + // (VmtHookEntry owns a safetyhook::VmtHook). std::nothrow preserves + // the destructor's noexcept contract; on allocation failure the new + // expression returns nullptr without running the move constructor, + // the source maps keep their contents, and control falls through to + // the normal ~HookManager member destruction epilogue. That epilogue + // is best-effort under loader lock, but the pinned module still keeps + // trampoline code pages live so straggler trampoline calls land on + // valid memory. + using HookMap = std::unordered_map, detail::TransparentStringHash, std::equal_to<>>; + using VmtHookMap = std::unordered_map>; + [[maybe_unused]] auto *leaked_hooks = new (std::nothrow) HookMap(std::move(m_hooks)); + [[maybe_unused]] auto *leaked_vmt_hooks = new (std::nothrow) VmtHookMap(std::move(m_vmt_hooks)); m_shutdown_called.store(true, std::memory_order_release); return; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 59bf9e3..b31a329 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,99 +1,116 @@ cmake_minimum_required(VERSION 3.25) -# Fetch GoogleTest -include(FetchContent) -FetchContent_Declare( - googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.14.0 -) - -# For Windows: Prevent overriding the parent project's compiler/linker settings -set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) -FetchContent_MakeAvailable(googletest) - -# Suppress warnings from GoogleTest sources -foreach(_ext_target gtest gtest_main gmock gmock_main) - if(TARGET ${_ext_target}) - target_compile_options(${_ext_target} PRIVATE - $<$:-w> - $<$:/w> - ) - endif() -endforeach() - -# Test fixture DLL for cross-module hook integration tests -add_library(hook_target_lib SHARED - "${CMAKE_CURRENT_SOURCE_DIR}/fixtures/hook_target_lib.cpp" -) -set_target_properties(hook_target_lib PROPERTIES - PREFIX "" - OUTPUT_NAME "hook_target_lib" -) - -# Automatically collect all test source files -# This will include all test_*.cpp files and main.cpp -file(GLOB TEST_SOURCES - CONFIGURE_DEPENDS - "${CMAKE_CURRENT_SOURCE_DIR}/main.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/test_*.cpp" -) - -# Sort the list for deterministic build order -list(SORT TEST_SOURCES) - -message(STATUS "Discovered test files:") - -foreach(test_file ${TEST_SOURCES}) - message(STATUS " - ${test_file}") -endforeach() - -# Enable testing for the DetourModKit library -add_executable(DetourModKit_tests ${TEST_SOURCES}) - -# Ensure the hook target DLL is built before tests and placed alongside the test exe -add_dependencies(DetourModKit_tests hook_target_lib) -add_custom_command(TARGET hook_target_lib POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - $ - $ -) - -# Link GoogleTest and DetourModKit. -# psapi is needed by test_hook_integration.cpp (GetModuleInformation). -# user32 and xinput1_4 propagate transitively from DetourModKit (INTERFACE). -target_link_libraries(DetourModKit_tests - PRIVATE - DetourModKit - GTest::gtest_main - psapi -) - -# Include directories -target_include_directories(DetourModKit_tests PRIVATE - ${CMAKE_SOURCE_DIR}/include - ${CMAKE_SOURCE_DIR}/src -) - -# External dependency headers as SYSTEM to suppress their warnings -target_include_directories(DetourModKit_tests SYSTEM PRIVATE - ${CMAKE_SOURCE_DIR}/external/simpleini - ${DIRECTXMATH_INCLUDE_DIR} -) - -# Register tests with CTest -include(GoogleTest) -gtest_discover_tests(DetourModKit_tests DISCOVERY_MODE PRE_TEST) - -# Optional: Add code coverage if requested -option(DMK_ENABLE_COVERAGE "Enable code coverage reporting" OFF) - -if(DMK_ENABLE_COVERAGE) - if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - # Instrument both the library and the test executable for coverage - target_compile_options(DetourModKit PRIVATE --coverage -fprofile-arcs -ftest-coverage) - target_link_options(DetourModKit PRIVATE --coverage) - target_compile_options(DetourModKit_tests PRIVATE --coverage -fprofile-arcs -ftest-coverage) - target_link_options(DetourModKit_tests PRIVATE --coverage) +if(DMK_BUILD_TESTS) + # Fetch GoogleTest + include(FetchContent) + FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.14.0 + ) + + # For Windows: Prevent overriding the parent project's compiler/linker settings + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) + + # Suppress warnings from GoogleTest sources + foreach(_ext_target gtest gtest_main gmock gmock_main) + if(TARGET ${_ext_target}) + target_compile_options(${_ext_target} PRIVATE + $<$:-w> + $<$:/w> + ) + endif() + endforeach() + + # Test fixture DLL for cross-module hook integration tests + add_library(hook_target_lib SHARED + "${CMAKE_CURRENT_SOURCE_DIR}/fixtures/hook_target_lib.cpp" + ) + set_target_properties(hook_target_lib PROPERTIES + PREFIX "" + OUTPUT_NAME "hook_target_lib" + ) + + # Automatically collect all test source files + # This will include all test_*.cpp files and main.cpp + file(GLOB TEST_SOURCES + CONFIGURE_DEPENDS + "${CMAKE_CURRENT_SOURCE_DIR}/main.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/test_*.cpp" + ) + + # Sort the list for deterministic build order + list(SORT TEST_SOURCES) + + message(STATUS "Discovered test files:") + + foreach(test_file ${TEST_SOURCES}) + message(STATUS " - ${test_file}") + endforeach() + + # Enable testing for the DetourModKit library + add_executable(DetourModKit_tests ${TEST_SOURCES}) + + # Ensure the hook target DLL is built before tests and placed alongside the test exe + add_dependencies(DetourModKit_tests hook_target_lib) + add_custom_command(TARGET hook_target_lib POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + $ + ) + + # Link GoogleTest and DetourModKit. + # psapi is needed by test_hook_integration.cpp (GetModuleInformation). + # user32 and xinput1_4 propagate transitively from DetourModKit (INTERFACE). + target_link_libraries(DetourModKit_tests + PRIVATE + DetourModKit + GTest::gtest_main + psapi + ) + + # Include directories + target_include_directories(DetourModKit_tests PRIVATE + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/src + ) + + # External dependency headers as SYSTEM to suppress their warnings + target_include_directories(DetourModKit_tests SYSTEM PRIVATE + ${CMAKE_SOURCE_DIR}/external/simpleini + ${DIRECTXMATH_INCLUDE_DIR} + ) + + # Register tests with CTest + include(GoogleTest) + gtest_discover_tests(DetourModKit_tests DISCOVERY_MODE PRE_TEST) + + # Optional: Add code coverage if requested + option(DMK_ENABLE_COVERAGE "Enable code coverage reporting" OFF) + + if(DMK_ENABLE_COVERAGE) + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + # Instrument both the library and the test executable for coverage + target_compile_options(DetourModKit PRIVATE --coverage -fprofile-arcs -ftest-coverage) + target_link_options(DetourModKit PRIVATE --coverage) + target_compile_options(DetourModKit_tests PRIVATE --coverage -fprofile-arcs -ftest-coverage) + target_link_options(DetourModKit_tests PRIVATE --coverage) + endif() endif() endif() + +if(DMK_BUILD_BENCHMARKS) + # Standalone microbench executable. Deliberately does NOT link gtest -- + # the benchmark is a raw main() that prints a TSV table to stdout, so it + # can run under any build configuration without the gtest runtime. + add_executable(DetourModKit_bench + "${CMAKE_CURRENT_SOURCE_DIR}/bench_event_dispatcher.cpp" + ) + + target_link_libraries(DetourModKit_bench PRIVATE DetourModKit) + + target_include_directories(DetourModKit_bench PRIVATE + ${CMAKE_SOURCE_DIR}/include + ) +endif() diff --git a/tests/bench_event_dispatcher.cpp b/tests/bench_event_dispatcher.cpp new file mode 100644 index 0000000..6055f5e --- /dev/null +++ b/tests/bench_event_dispatcher.cpp @@ -0,0 +1,251 @@ +/** + * @file bench_event_dispatcher.cpp + * @brief Standalone microbenchmark harness for EventDispatcher. + * + * Measures emit(), emit_safe(), subscribe + unsubscribe round-trip, and + * concurrent emit throughput. Uses only std::chrono::steady_clock so no + * extra dependency is pulled in. + * + * Build with -DDMK_BUILD_BENCHMARKS=ON. Executable name: DetourModKit_bench. + * + * Output is a tab-separated table on stdout. One row per metric. Columns: + * scenario, subscribers, iterations, median_ns_per_op, total_ms + * + * This binary is deliberately not a gtest: it is a separate executable so + * it can run under whatever build configuration the user wants (release, + * release+PGO, etc.) without dragging in the gtest runtime. + */ + +#include "DetourModKit/event_dispatcher.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + struct BenchEvent + { + int value{0}; + }; + + using Clock = std::chrono::steady_clock; + + // Compiler barrier / value sink. Prevents the optimizer from noticing + // that the handler result is unused and deleting the whole emit loop. + volatile std::uint64_t g_sink = 0; + + void noop_handler(const BenchEvent &e) noexcept + { + g_sink = g_sink + static_cast(e.value); + } + + // Runs `op` `iterations` times within a single sample, repeats the + // sample `samples` times, and returns the median wall time per sample + // in nanoseconds divided by iterations (i.e. per-op cost). + template + double median_ns_per_op(std::size_t iterations, std::size_t samples, Op &&op) + { + std::vector per_op; + per_op.reserve(samples); + + for (std::size_t s = 0; s < samples; ++s) + { + const auto start = Clock::now(); + for (std::size_t i = 0; i < iterations; ++i) + { + op(); + } + const auto end = Clock::now(); + const auto ns = std::chrono::duration_cast( + end - start) + .count(); + per_op.push_back(static_cast(ns) / static_cast(iterations)); + } + + std::sort(per_op.begin(), per_op.end()); + return per_op[per_op.size() / 2]; + } + + void bench_emit(std::size_t subscriber_count, + std::size_t iterations, + std::size_t samples, + const char *label, + bool use_safe) + { + DetourModKit::EventDispatcher dispatcher; + std::vector subs; + subs.reserve(subscriber_count); + for (std::size_t i = 0; i < subscriber_count; ++i) + { + subs.push_back(dispatcher.subscribe(&noop_handler)); + } + + const BenchEvent evt{42}; + const auto total_start = Clock::now(); + const double med = median_ns_per_op(iterations, samples, [&]() { + if (use_safe) + { + dispatcher.emit_safe(evt); + } + else + { + dispatcher.emit(evt); + } + }); + const auto total_end = Clock::now(); + + const auto total_ms = std::chrono::duration_cast( + total_end - total_start) + .count(); + + std::printf("%s\t%zu\t%zu\t%.2f\t%lld\n", + label, + subscriber_count, + iterations, + med, + static_cast(total_ms)); + } + + void bench_subscribe_unsubscribe(std::size_t iterations, std::size_t samples) + { + DetourModKit::EventDispatcher dispatcher; + + const auto total_start = Clock::now(); + const double med = median_ns_per_op(iterations, samples, [&]() { + auto sub = dispatcher.subscribe(&noop_handler); + // sub destroyed here: triggers unsubscribe + }); + const auto total_end = Clock::now(); + + const auto total_ms = std::chrono::duration_cast( + total_end - total_start) + .count(); + + std::printf("subscribe_unsub_roundtrip\t0\t%zu\t%.2f\t%lld\n", + iterations, + med, + static_cast(total_ms)); + } + + void bench_concurrent_emit(std::size_t thread_count, + std::size_t per_thread_iters, + std::size_t subscriber_count) + { + DetourModKit::EventDispatcher dispatcher; + std::vector subs; + subs.reserve(subscriber_count); + for (std::size_t i = 0; i < subscriber_count; ++i) + { + subs.push_back(dispatcher.subscribe(&noop_handler)); + } + + std::atomic go{false}; + std::vector workers; + workers.reserve(thread_count); + + const BenchEvent evt{7}; + + for (std::size_t t = 0; t < thread_count; ++t) + { + workers.emplace_back([&dispatcher, &go, &evt, per_thread_iters]() { + while (!go.load(std::memory_order_acquire)) + { + std::this_thread::yield(); + } + for (std::size_t i = 0; i < per_thread_iters; ++i) + { + dispatcher.emit(evt); + } + }); + } + + const auto start = Clock::now(); + go.store(true, std::memory_order_release); + for (auto &w : workers) + { + w.join(); + } + const auto end = Clock::now(); + + const auto total_ns = std::chrono::duration_cast( + end - start) + .count(); + const auto total_ops = thread_count * per_thread_iters; + const double per_op = + static_cast(total_ns) / static_cast(total_ops); + + std::printf("emit_concurrent_%zu_threads\t%zu\t%zu\t%.2f\t%lld\n", + thread_count, + subscriber_count, + total_ops, + per_op, + static_cast(total_ns / 1'000'000)); + } + + void bench_reentrancy_rejection(std::size_t iterations, std::size_t samples) + { + DetourModKit::EventDispatcher dispatcher; + + // Inside the handler, subscribe() will be rejected by the + // reentrancy guard. The cost measured here is the rejection path. + auto sub = dispatcher.subscribe([&dispatcher](const BenchEvent &) { + auto inner = dispatcher.subscribe(&noop_handler); + (void)inner; + }); + + const BenchEvent evt{0}; + + const auto total_start = Clock::now(); + const double med = median_ns_per_op(iterations, samples, [&]() { + dispatcher.emit(evt); + }); + const auto total_end = Clock::now(); + + const auto total_ms = std::chrono::duration_cast( + total_end - total_start) + .count(); + + std::printf("reentrancy_rejection\t1\t%zu\t%.2f\t%lld\n", + iterations, + med, + static_cast(total_ms)); + } +} // namespace + +int main() +{ + constexpr std::size_t samples = 11; // odd, so median is well-defined + + std::printf("scenario\tsubscribers\titerations\tmedian_ns_per_op\ttotal_ms\n"); + + // --- Single-thread emit() --- + bench_emit(0, 10'000'000, samples, "emit", false); + bench_emit(1, 5'000'000, samples, "emit", false); + bench_emit(8, 1'000'000, samples, "emit", false); + bench_emit(64, 200'000, samples, "emit", false); + + // --- Single-thread emit_safe() --- + bench_emit(0, 10'000'000, samples, "emit_safe", true); + bench_emit(1, 5'000'000, samples, "emit_safe", true); + bench_emit(8, 1'000'000, samples, "emit_safe", true); + bench_emit(64, 200'000, samples, "emit_safe", true); + + // --- Subscribe + unsubscribe round-trip --- + bench_subscribe_unsubscribe(100'000, samples); + + // --- Concurrent emit throughput --- + bench_concurrent_emit(4, 1'000'000, 8); + + // --- Reentrancy-rejection path --- + bench_reentrancy_rejection(500'000, samples); + + // Touch g_sink so the optimizer keeps the handler body. + std::printf("# sink=%llu\n", static_cast(g_sink)); + return 0; +} diff --git a/tests/test_event_dispatcher.cpp b/tests/test_event_dispatcher.cpp index 5d3df8d..ef7a61b 100644 --- a/tests/test_event_dispatcher.cpp +++ b/tests/test_event_dispatcher.cpp @@ -1,9 +1,16 @@ #include +// Enables the test-only debug_snapshot_use_count() diagnostic on the +// dispatcher. Defined here (before the header include) so it affects only +// this translation unit. +#define DMK_EVENT_DISPATCHER_INTERNAL_TESTING 1 + #include "DetourModKit/event_dispatcher.hpp" #include #include +#include +#include #include #include #include @@ -497,3 +504,127 @@ TEST(EventDispatcherTest, UnsubscribeInsideHandler_SucceedsOnDestruction) EXPECT_EQ(dispatcher.subscriber_count(), 0u); } +// --- Lock-free empty fast path --- + +TEST(EventDispatcherTest, EmptyFastPath_SkipsLock) +{ + // A freshly-constructed dispatcher holds exactly one snapshot reference + // internally. With no subscribers, emit() must be observably a no-op + // and subscriber_count()/empty() must report zero without mutating state. + EventDispatcher dispatcher; + + EXPECT_TRUE(dispatcher.empty()); + EXPECT_EQ(dispatcher.subscriber_count(), 0u); + + // Record the dispatcher's own reference to its handler snapshot before + // any emit. If the fast path really skips the snapshot load, emit() + // should not leave any residual references alive afterwards. + const long use_count_before = dispatcher.debug_snapshot_use_count(); + EXPECT_EQ(use_count_before, 1); + + for (int i = 0; i < 1000; ++i) + { + dispatcher.emit(SimpleEvent{i}); + dispatcher.emit_safe(SimpleEvent{i}); + } + + EXPECT_TRUE(dispatcher.empty()); + EXPECT_EQ(dispatcher.subscriber_count(), 0u); + EXPECT_EQ(dispatcher.debug_snapshot_use_count(), use_count_before); +} + +// --- Snapshot stability: in-flight emit sees pre-subscribe snapshot --- + +TEST(EventDispatcherTest, SnapshotStability_DuringEmit) +{ + EventDispatcher dispatcher; + + std::mutex gate_mtx; + std::condition_variable handler_started; + std::condition_variable handler_may_finish; + bool started{false}; + bool may_finish{false}; + + std::atomic old_calls{0}; + std::atomic new_calls{0}; + + // Pre-subscribed handler signals when the emit has begun, then blocks + // until the main thread allows it to continue. + auto old_sub = dispatcher.subscribe([&](const SimpleEvent &) { + { + std::lock_guard lk{gate_mtx}; + started = true; + } + handler_started.notify_one(); + + std::unique_lock lk{gate_mtx}; + handler_may_finish.wait(lk, [&] { return may_finish; }); + old_calls.fetch_add(1, std::memory_order_relaxed); + }); + + // Launch an emitter thread; it will block inside the pre-subscribed + // handler holding a shared_ptr snapshot of the current handler list. + std::thread emitter([&] { + dispatcher.emit(SimpleEvent{0}); + }); + + { + std::unique_lock lk{gate_mtx}; + handler_started.wait(lk, [&] { return started; }); + } + + // Subscribe a new handler while the emit is still in flight. + auto new_sub = dispatcher.subscribe([&](const SimpleEvent &) { + new_calls.fetch_add(1, std::memory_order_relaxed); + }); + + // The freshly-subscribed handler must not be visible to the in-flight + // emit, which captured its snapshot before new_sub was published. + EXPECT_EQ(new_calls.load(), 0); + + // Release the blocked handler and let the emit complete. + { + std::lock_guard lk{gate_mtx}; + may_finish = true; + } + handler_may_finish.notify_one(); + emitter.join(); + + EXPECT_EQ(old_calls.load(), 1); + EXPECT_EQ(new_calls.load(), 0); + + // The next emit publishes a snapshot that includes both handlers. + dispatcher.emit(SimpleEvent{1}); + EXPECT_EQ(old_calls.load(), 2); + EXPECT_EQ(new_calls.load(), 1); +} + +// --- Snapshot reclamation: no leaked shared_ptr references after churn --- + +TEST(EventDispatcherTest, SnapshotReclamation_NoLeak) +{ + EventDispatcher dispatcher; + + // Heavy subscribe + unsubscribe churn with interleaved emits. Each + // subscribe allocates a new snapshot; each unsubscribe does the same. + // Once every subscription's RAII guard is destroyed and no emit is in + // flight, only the dispatcher's own reference to the latest (empty) + // snapshot should remain. + constexpr int iterations = 10000; + for (int i = 0; i < iterations; ++i) + { + auto sub = dispatcher.subscribe([](const SimpleEvent &) {}); + if ((i & 0xFF) == 0) + { + dispatcher.emit(SimpleEvent{i}); + } + // sub destroyed here -- handler removed, new snapshot published + } + + EXPECT_EQ(dispatcher.subscriber_count(), 0u); + EXPECT_TRUE(dispatcher.empty()); + EXPECT_EQ(dispatcher.debug_snapshot_use_count(), 1) + << "Dispatcher should hold exactly one reference to its snapshot " + "after all subscriptions are released"; +} + diff --git a/tests/test_hook_manager.cpp b/tests/test_hook_manager.cpp index 99e91dd..3abc974 100644 --- a/tests/test_hook_manager.cpp +++ b/tests/test_hook_manager.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "DetourModKit/hook_manager.hpp" @@ -13,6 +14,31 @@ using namespace DetourModKit; +// VmtHookEntry owns a safetyhook::VmtHook and must be move-only. +// The HookManager destructor's loader-lock fallback path relies on these +// guarantees when storing VmtHookEntry values inside an unordered_map: any +// container operation that selects a copy fallback for VmtHookEntry would +// fail to compile, so guard the contract here. +static_assert(!std::is_copy_constructible_v, + "VmtHookEntry must remain non-copyable to preserve VmtHook ownership semantics."); +static_assert(!std::is_copy_assignable_v, + "VmtHookEntry must remain non-copy-assignable to preserve VmtHook ownership semantics."); +static_assert(std::is_move_constructible_v, + "VmtHookEntry must be move-constructible so it can live in standard containers."); +static_assert(std::is_move_assignable_v, + "VmtHookEntry must be move-assignable so it can live in standard containers."); + +// The loader-lock fallback in HookManager::~HookManager heap-allocates the +// move-constructed hook maps directly. Guard that the exact map types stay +// move-constructible so that path keeps compiling if the hasher, comparator, +// or value types are ever retuned. +using VmtHookMapForAsserts = std::unordered_map>; +using HookMapForAsserts = std::unordered_map, detail::TransparentStringHash, std::equal_to<>>; +static_assert(std::is_move_constructible_v, + "unordered_map must be move-constructible for the loader-lock leak path."); +static_assert(std::is_move_constructible_v, + "unordered_map, ...> must be move-constructible for the loader-lock leak path."); + class HookManagerTest : public ::testing::Test { protected: From 01bb72da77de5253f2f03bba158e2c5eeee13da9 Mon Sep 17 00:00:00 2001 From: Quang Trinh Date: Fri, 24 Apr 2026 04:25:14 +0700 Subject: [PATCH 2/3] docs(bench): use multi-run medians and correct version label Archive 5 runs per side under runs/, report medians with run-to-run spread as the noise floor. The prior single-run 64-subscriber regression was a statistical outlier (within 1% across 5 runs per side). Rename the directory to match the v3.1.0 release label. --- .../event_dispatcher_bench_v3.1.0/README.md | 108 ++++++++++++++++++ .../after.tsv | 0 .../before.tsv | 0 .../runs/new_1.tsv | 13 +++ .../runs/new_2.tsv | 13 +++ .../runs/new_3.tsv | 13 +++ .../runs/new_4.tsv | 13 +++ .../runs/new_5.tsv | 13 +++ .../runs/old_1.tsv | 13 +++ .../runs/old_2.tsv | 13 +++ .../runs/old_3.tsv | 13 +++ .../runs/old_4.tsv | 13 +++ .../runs/old_5.tsv | 13 +++ .../event_dispatcher_bench_v3.2.0/README.md | 106 ----------------- 14 files changed, 238 insertions(+), 106 deletions(-) create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/README.md rename docs/analysis/{event_dispatcher_bench_v3.2.0 => event_dispatcher_bench_v3.1.0}/after.tsv (100%) rename docs/analysis/{event_dispatcher_bench_v3.2.0 => event_dispatcher_bench_v3.1.0}/before.tsv (100%) create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_1.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_2.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_3.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_4.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_5.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_1.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_2.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_3.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_4.tsv create mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_5.tsv delete mode 100644 docs/analysis/event_dispatcher_bench_v3.2.0/README.md diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/README.md b/docs/analysis/event_dispatcher_bench_v3.1.0/README.md new file mode 100644 index 0000000..26c4a5b --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/README.md @@ -0,0 +1,108 @@ +# EventDispatcher Bench, v3.1.0 + +Before/after numbers for the lock-free COW snapshot `emit()` landed in v3.1.0. +The previous implementation used `std::shared_mutex` for `emit()` / `emit_safe()` +and an exclusive lock for `subscribe()` / `unsubscribe()`. The new implementation +stores handlers in a `std::atomic>>` +snapshot published on mutation, with a lock-free atomic handler-count fast +path for the zero-subscriber case. + +## Results (median of 5 runs per side) + +| Scenario | Subs | Before (ns/op) | After (ns/op) | Delta | Verdict | +| --------------------------- | ---: | -------------: | ------------: | ----------------- | :------ | +| `emit` | 0 | 103.9 | **6.0** | **-94.2% (17x)** | REAL | +| `emit` | 1 | 120.1 | 94.4 | **-21.4%** | REAL | +| `emit` | 8 | 245.6 | 216.3 | **-11.9%** | REAL | +| `emit` | 64 | 1103.5 | 1092.1 | -1.0% | NOISE | +| `emit_safe` | 0 | 103.1 | **5.7** | **-94.5% (18x)** | REAL | +| `emit_safe` | 1 | 118.6 | 96.4 | **-18.7%** | REAL | +| `emit_safe` | 8 | 233.2 | 219.1 | -6.0% | REAL | +| `emit_safe` | 64 | 1086.3 | 1099.8 | +1.2% | NOISE | +| `emit_concurrent_4_threads` | 8 | 517.9 | **248.2** | **-52.1% (2.1x)** | REAL | +| `subscribe_unsub_roundtrip` | — | 446.0 | 1150.4 | +158.0% | REAL | +| `reentrancy_rejection` | 1 | 212.5 | 192.7 | -9.4% | marginal| + +Verdict key: + +- **REAL**: median delta exceeds 1.5x the combined run-to-run spread on both sides. +- **NOISE**: median delta is smaller than the run-to-run spread; cannot be distinguished from measurement jitter. +- **marginal**: delta is larger than spread but smaller than 1.5x spread. + +Run-to-run coefficient of variation was 1% to 5% per scenario. Full per-run +TSVs live in [runs/](runs/) (5 OLD + 5 NEW). A representative single run per +side is preserved in [before.tsv](before.tsv) and [after.tsv](after.tsv) for +quick reference. + +## Interpretation + +**Zero-subscriber fast path.** The atomic handler-count short-circuit in +`emit()` / `emit_safe()` collapses a `shared_mutex` acquire/release plus +iteration setup into a single `memory_order_acquire` load of an 8-byte counter. +The 17x factor is the cost of an uncontended `shared_mutex` acquire/release +on Windows SRWLOCK relative to a naked atomic load, and it is the dominant +result for dispatchers that are wired up at init but rarely subscribed to. + +**1 to 8 subscriber uncontended emit.** Consistent wins (6% to 21%) from +removing the reader lock. The snapshot load is a release-acquire atomic plus +a `shared_ptr` refcount bump, which is cheaper than touching a mutex's state +word unconditionally. + +**Concurrent emit (4 threads, 8 subs).** 2.1x throughput. No reader lock +means no cache-line contention on the mutex state, so all four threads make +progress in parallel instead of serializing on the SRWLOCK read side. + +**64 subscriber emit.** Within noise on both `emit` (-1.0%) and `emit_safe` +(+1.2%). An earlier single-run measurement suggested an 18% regression; that +was a statistical outlier. Across 5 runs per side the two implementations +are indistinguishable at this subscriber count: the per-handler iteration +cost dominates and both paths reach the same `std::vector` buffer +layout through one extra dereference either way. + +**Subscribe / unsubscribe round-trip.** 2.6x slower (446 ns to 1150 ns). +Each mutation allocates a fresh handler vector, appends or removes the +entry, and publishes via atomic store. This is documented in the header +and is the accepted tradeoff for lock-free reads. Subscribe is not on a +hot path in any realistic mod workload. + +**Reentrancy rejection.** Marginal improvement (within 1.5x spread). Not a +meaningful claim; effectively unchanged. + +## Methodology + +- Host: Windows 11, MinGW `mingw-release` preset (GCC 13, libstdc++, -O3 LTO). +- CMake: `cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF`. +- Build: `DetourModKit_bench` target only. No gtest linkage, no other test deps. +- Each sample runs N iterations of the scenario inside a single + `steady_clock::now()` pair. Reported value is the median per-op cost across + 11 samples inside one process invocation. Iteration counts are chosen so + each sample takes roughly the same wall time. +- 5 process invocations per side (OLD vs NEW), back-to-back, same machine, + same thermal state. Tables above report the median across those 5 runs + for each scenario. +- Verdicts use run-to-run spread (max minus min across the 5 runs) as the + noise floor. A claim is "REAL" only when the median delta exceeds 1.5x + that noise floor on both sides. + +## Reproduce + +```bash +cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF +PATH="/c/msys64/mingw64/bin:$PATH" cmake --build build/mingw-release --target DetourModKit_bench --parallel +PATH="/c/msys64/mingw64/bin:$PATH" ./build/mingw-release/tests/DetourModKit_bench.exe > run.tsv +``` + +For a clean before/after comparison, bench the new implementation first, +copy the header aside, `git checkout HEAD -- include/DetourModKit/event_dispatcher.hpp` +to restore the baseline header, rebuild the `DetourModKit_bench` target, run +again into the baseline TSV, then restore the new header. Repeat N times +per side and compare medians with an explicit noise-floor check. + +## Caveat on committed TSVs + +The TSVs in this directory are raw artifacts from a specific host and +compiler version. They are not a stable baseline. Treat them as evidence +for the claims in this document, not as a regression gate. Future bench +runs should regenerate their own numbers and compare against the structure +of the results (17x fast-path win, 2x concurrent win, COW subscribe cost) +rather than the absolute nanosecond values. diff --git a/docs/analysis/event_dispatcher_bench_v3.2.0/after.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/after.tsv similarity index 100% rename from docs/analysis/event_dispatcher_bench_v3.2.0/after.tsv rename to docs/analysis/event_dispatcher_bench_v3.1.0/after.tsv diff --git a/docs/analysis/event_dispatcher_bench_v3.2.0/before.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/before.tsv similarity index 100% rename from docs/analysis/event_dispatcher_bench_v3.2.0/before.tsv rename to docs/analysis/event_dispatcher_bench_v3.1.0/before.tsv diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_1.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_1.tsv new file mode 100644 index 0000000..fe3b959 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_1.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 6.06 668 +emit 1 5000000 104.53 5715 +emit 8 1000000 220.31 2449 +emit 64 200000 1126.05 2521 +emit_safe 0 10000000 5.71 630 +emit_safe 1 5000000 97.17 5387 +emit_safe 8 1000000 219.06 2428 +emit_safe 64 200000 1120.66 2478 +subscribe_unsub_roundtrip 0 100000 1180.98 1313 +emit_concurrent_4_threads 8 4000000 245.09 980 +reentrancy_rejection 1 500000 203.29 1120 +# sink=23939455106 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_2.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_2.tsv new file mode 100644 index 0000000..3e75f60 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_2.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 6.00 663 +emit 1 5000000 96.19 5303 +emit 8 1000000 220.13 2431 +emit 64 200000 1116.99 2462 +emit_safe 0 10000000 5.69 624 +emit_safe 1 5000000 96.83 5311 +emit_safe 8 1000000 220.24 2438 +emit_safe 64 200000 1111.00 2443 +subscribe_unsub_roundtrip 0 100000 1156.80 1275 +emit_concurrent_4_threads 8 4000000 248.19 992 +reentrancy_rejection 1 500000 190.95 1072 +# sink=23940408562 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_3.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_3.tsv new file mode 100644 index 0000000..b6ae91c --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_3.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 6.08 675 +emit 1 5000000 94.44 5200 +emit 8 1000000 215.45 2394 +emit 64 200000 1092.06 2412 +emit_safe 0 10000000 5.79 641 +emit_safe 1 5000000 96.42 5376 +emit_safe 8 1000000 216.73 2395 +emit_safe 64 200000 1099.84 2487 +subscribe_unsub_roundtrip 0 100000 1150.42 1277 +emit_concurrent_4_threads 8 4000000 257.35 1029 +reentrancy_rejection 1 500000 192.65 1081 +# sink=23936874150 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_4.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_4.tsv new file mode 100644 index 0000000..a28d6b7 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_4.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 5.72 627 +emit 1 5000000 93.81 5154 +emit 8 1000000 216.31 2375 +emit 64 200000 1091.75 2408 +emit_safe 0 10000000 5.57 614 +emit_safe 1 5000000 95.42 5236 +emit_safe 8 1000000 220.80 2418 +emit_safe 64 200000 1095.09 2407 +subscribe_unsub_roundtrip 0 100000 1123.53 1244 +emit_concurrent_4_threads 8 4000000 235.05 940 +reentrancy_rejection 1 500000 188.63 1060 +# sink=23945296277 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_5.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_5.tsv new file mode 100644 index 0000000..c10abe2 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_5.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 5.80 642 +emit 1 5000000 92.25 5104 +emit 8 1000000 211.34 2341 +emit 64 200000 1085.13 2377 +emit_safe 0 10000000 5.67 618 +emit_safe 1 5000000 93.57 5143 +emit_safe 8 1000000 218.15 2393 +emit_safe 64 200000 1082.98 2385 +subscribe_unsub_roundtrip 0 100000 1127.63 1247 +emit_concurrent_4_threads 8 4000000 255.91 1023 +reentrancy_rejection 1 500000 193.98 1071 +# sink=23933756560 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_1.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_1.tsv new file mode 100644 index 0000000..b554536 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_1.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 106.61 11705 +emit 1 5000000 128.61 7036 +emit 8 1000000 249.45 2768 +emit 64 200000 1139.86 2500 +emit_safe 0 10000000 105.04 11582 +emit_safe 1 5000000 120.53 6652 +emit_safe 8 1000000 241.97 2673 +emit_safe 64 200000 1093.30 2430 +subscribe_unsub_roundtrip 0 100000 469.98 514 +emit_concurrent_4_threads 8 4000000 519.06 2076 +reentrancy_rejection 1 500000 203.05 1151 +# sink=24040673944 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_2.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_2.tsv new file mode 100644 index 0000000..9117d9d --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_2.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 106.19 11701 +emit 1 5000000 120.09 6623 +emit 8 1000000 245.57 2706 +emit 64 200000 1109.16 2439 +emit_safe 0 10000000 105.08 11571 +emit_safe 1 5000000 118.56 6567 +emit_safe 8 1000000 233.16 2585 +emit_safe 64 200000 1081.20 2374 +subscribe_unsub_roundtrip 0 100000 444.50 488 +emit_concurrent_4_threads 8 4000000 509.16 2036 +reentrancy_rejection 1 500000 212.53 1178 +# sink=24038786247 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_3.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_3.tsv new file mode 100644 index 0000000..c08fdc6 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_3.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 103.94 11593 +emit 1 5000000 120.79 6686 +emit 8 1000000 240.88 2707 +emit 64 200000 1079.81 2389 +emit_safe 0 10000000 103.12 11299 +emit_safe 1 5000000 118.56 6565 +emit_safe 8 1000000 232.03 2566 +emit_safe 64 200000 1109.64 2442 +subscribe_unsub_roundtrip 0 100000 445.98 492 +emit_concurrent_4_threads 8 4000000 510.86 2043 +reentrancy_rejection 1 500000 214.64 1186 +# sink=24041896088 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_4.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_4.tsv new file mode 100644 index 0000000..34489f0 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_4.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 101.87 11219 +emit 1 5000000 118.78 6579 +emit 8 1000000 247.11 2733 +emit 64 200000 1103.48 2483 +emit_safe 0 10000000 102.44 11260 +emit_safe 1 5000000 118.52 6520 +emit_safe 8 1000000 233.51 2566 +emit_safe 64 200000 1082.23 2385 +subscribe_unsub_roundtrip 0 100000 453.03 499 +emit_concurrent_4_threads 8 4000000 517.91 2071 +reentrancy_rejection 1 500000 215.26 1180 +# sink=24041522113 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_5.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_5.tsv new file mode 100644 index 0000000..b9b6895 --- /dev/null +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_5.tsv @@ -0,0 +1,13 @@ +scenario subscribers iterations median_ns_per_op total_ms +emit 0 10000000 103.69 11480 +emit 1 5000000 118.81 6567 +emit 8 1000000 239.97 2648 +emit 64 200000 1084.24 2409 +emit_safe 0 10000000 102.46 11257 +emit_safe 1 5000000 117.21 6484 +emit_safe 8 1000000 232.27 2557 +emit_safe 64 200000 1086.31 2374 +subscribe_unsub_roundtrip 0 100000 435.42 478 +emit_concurrent_4_threads 8 4000000 520.76 2083 +reentrancy_rejection 1 500000 208.94 1168 +# sink=24040143757 diff --git a/docs/analysis/event_dispatcher_bench_v3.2.0/README.md b/docs/analysis/event_dispatcher_bench_v3.2.0/README.md deleted file mode 100644 index c54f18e..0000000 --- a/docs/analysis/event_dispatcher_bench_v3.2.0/README.md +++ /dev/null @@ -1,106 +0,0 @@ -# EventDispatcher Bench, v3.2.0 - -Before/after numbers for the lock-free COW snapshot `emit()` landed in v3.2.0. -The previous implementation used `std::shared_mutex` for `emit()` / `emit_safe()` -and an exclusive lock for `subscribe()` / `unsubscribe()`. The new implementation -stores handlers in a `std::atomic>>` -snapshot published on mutation, with a lock-free atomic handler-count fast -path for the zero-subscriber case. - -## Results - -| Scenario | Subs | Before (ns/op) | After (ns/op) | Delta | -| --------------------------- | ---: | -------------: | ------------: | ----------------- | -| `emit` | 0 | 105.20 | **6.47** | **-94% (16.3x)** | -| `emit` | 1 | 126.23 | 106.85 | -15% | -| `emit` | 8 | 253.99 | 249.52 | -2% | -| `emit` | 64 | 1121.43 | 1324.66 | +18% (regression) | -| `emit_safe` | 0 | 103.55 | **6.32** | **-94% (16.4x)** | -| `emit_safe` | 1 | 119.27 | 106.76 | -10% | -| `emit_safe` | 8 | 231.13 | 208.92 | -10% | -| `emit_safe` | 64 | 1169.86 | 1077.59 | -8% | -| `subscribe_unsub_roundtrip` | 0 | 487.18 | 1125.23 | +131% (expected) | -| `emit_concurrent_4_threads` | 8 | 551.73 | **268.07** | **-51% (2.06x)** | -| `reentrancy_rejection` | 1 | 239.07 | 202.82 | -15% | - -Raw TSVs in [before.tsv](before.tsv) and [after.tsv](after.tsv). Each row is the -median of 11 samples. Iteration counts vary per row (10M for fast cases down to -200K for the slowest) to keep per-scenario wall time comparable. - -## Interpretation - -**Zero-subscriber fast path.** The atomic handler-count short-circuit in -`emit()` / `emit_safe()` collapses a `shared_mutex` acquire/release plus -iteration setup into a single `memory_order_acquire` load of an 8-byte counter. -The 16x factor is the cost of an uncontended `shared_mutex` acquire/release -on Windows SRWLOCK relative to a naked atomic load, and it is the dominant -result for dispatchers that are wired up at init but rarely subscribed to. - -**1 to 8 subscriber uncontended emit.** Small consistent wins (10% to 15%) -from removing the reader lock. The snapshot load is a release-acquire atomic -plus a `shared_ptr` refcount bump, which is cheaper than touching a mutex's -state word unconditionally. - -**Concurrent emit (4 threads, 8 subs).** 2.06x throughput. No reader lock -means no cache-line contention on the mutex state, so all four threads make -progress in parallel instead of serializing on the SRWLOCK read side. - -**64 subscriber emit, single thread.** 18% slower (+203 ns on a 1121 ns -baseline). Two plausible causes: - -1. Timer noise. On an 1100 ns run, 200 ns is 2-3 cycles worth of timer jitter - amplified across the sample; the noise floor on `steady_clock` is - typically in the tens of nanoseconds per sample. -2. `std::atomic` load cost dominates over the old loop's - single mutex acquire when amortized over only 64 handlers. libstdc++'s - implementation uses DWCAS (cmpxchg16b) on the snapshot atomic; MSVC - uses an internal spinlock. - -Typical DetourModKit usage (per the README: 1-10 subscribers per event, -dispatchers wired once at init) stays well inside the range where the -optimization is a pure win. The 64 subscriber row should be treated as a -worst-case indicator, not representative load. - -**Subscribe / unsubscribe round-trip.** 2.31x slower (487 ns to 1125 ns). -Each mutation allocates a fresh handler vector, appends or removes the -entry, and publishes via atomic store. This is documented in the header -and is the accepted tradeoff for lock-free reads. Subscribe is not on a -hot path in any realistic mod workload. - -**Concurrent emit, reentrancy rejection.** Small wins from the same -fast-path removal of the shared lock. - -## Methodology - -- Host: Windows 11, MinGW `mingw-release` preset (GCC 13, libstdc++, -O3 LTO). -- CMake: `cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF`. -- Build: `DetourModKit_bench` target only. No gtest linkage, no other test deps. -- Each sample runs N iterations of the scenario inside a single - `steady_clock::now()` pair. Reported value is the median per-op cost across - 11 samples. Iteration counts are chosen so each sample takes roughly the - same wall time. -- Back-to-back runs, same machine, same process start, thermal state - comparable. Numbers are not hermetic; reruns on the same machine drift by - a few percent at this granularity. - -## Reproduce - -```bash -cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF -PATH="/c/msys64/mingw64/bin:$PATH" cmake --build build/mingw-release --target DetourModKit_bench --parallel -PATH="/c/msys64/mingw64/bin:$PATH" ./build/mingw-release/tests/DetourModKit_bench.exe > after.tsv -``` - -For a clean before/after comparison, bench the new implementation first, -copy the header aside, `git checkout HEAD -- include/DetourModKit/event_dispatcher.hpp` -to restore the baseline header, rebuild the `DetourModKit_bench` target, run -again into `before.tsv`, then restore the new header. - -## Caveat on committed TSVs - -The `before.tsv` and `after.tsv` files in this directory are raw artifacts -from one run on one machine. They are not a stable baseline. Treat them as -evidence for the claims in this document, not as a regression gate. Future -bench runs should regenerate their own numbers and compare against the -structure of the results (16x fast-path win, 2x concurrent win, COW -subscribe cost) rather than the absolute nanosecond values. From 521aa6912954d2b777a5fd0d271e4a9744055703 Mon Sep 17 00:00:00 2001 From: Quang Trinh Date: Fri, 24 Apr 2026 04:37:10 +0700 Subject: [PATCH 3/3] fix: address PR review findings on event dispatcher PR - event_dispatcher: catch bad_alloc in noexcept unsubscribe()/clear(), leave state unchanged, return false for retry via Subscription::reset() - event_dispatcher + README: qualify "lock-free" claims to note that std::atomic may use an implementation-internal bit lock on toolchains without DWCAS (e.g. MSVC STL); the wait-free zero-subscriber fast path is unaffected - hook_manager: static_assert HookMap/VmtHookMap are nothrow-move- constructible so the noexcept loader-lock leak path cannot terminate - bench: replace volatile g_sink with std::atomic; make median_ns_per_op correct for even sample counts - test_hook_manager: tighten shutdown-blocks-killer race via an explicit killer-entered handoff before the EXPECT_FALSE assertion - coverage-pages.yml: enforce 80% line coverage gate via gcovr --fail-under-line 80 - bench docs: restore command references main, drop stale single-run before.tsv/after.tsv in favor of runs/ median --- .github/workflows/coverage-pages.yml | 1 + README.md | 5 +- .../event_dispatcher_bench_v3.1.0/README.md | 13 ++- .../event_dispatcher_bench_v3.1.0/after.tsv | 13 --- .../event_dispatcher_bench_v3.1.0/before.tsv | 13 --- include/DetourModKit/event_dispatcher.hpp | 94 ++++++++++++++----- src/hook_manager.cpp | 9 ++ tests/bench_event_dispatcher.cpp | 23 ++++- tests/test_hook_manager.cpp | 21 ++++- 9 files changed, 126 insertions(+), 66 deletions(-) delete mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/after.tsv delete mode 100644 docs/analysis/event_dispatcher_bench_v3.1.0/before.tsv diff --git a/.github/workflows/coverage-pages.yml b/.github/workflows/coverage-pages.yml index 6bc6712..2392608 100644 --- a/.github/workflows/coverage-pages.yml +++ b/.github/workflows/coverage-pages.yml @@ -85,6 +85,7 @@ jobs: --exclude "build/" \ --exclude "tests/" \ --gcov-ignore-parse-errors=negative_hits.warn_once_per_file \ + --fail-under-line 80 \ --print-summary \ --html-details coverage-report/index.html \ build/mingw-debug diff --git a/README.md b/README.md index 3f18fea..dac35cd 100644 --- a/README.md +++ b/README.md @@ -105,8 +105,9 @@ DetourModKit is a lightweight C++ toolkit designed to simplify common tasks in g - Typed pub/sub event system with RAII subscription management - Each `EventDispatcher` manages a single event type -- Lock-free `emit()` / `emit_safe()`: atomic acquire-load of a `std::shared_ptr` snapshot and iterate, no reader lock; `subscribe()` / `unsubscribe()` are copy-on-write under a small writer mutex -- Zero-subscriber fast path: `emit()` / `emit_safe()` short-circuit on a lock-free atomic handler counter, skipping the snapshot load entirely +- Reader-side lock-free fast path: `emit()` / `emit_safe()` acquire-load a `std::shared_ptr` snapshot and iterate with no reader lock; the snapshot load is genuinely lock-free on toolchains with a DWCAS-backed `std::atomic>` and may use an implementation-internal bit lock on toolchains that do not (for example MSVC's STL) +- Zero-subscriber fast path: `emit()` / `emit_safe()` short-circuit on a single `memory_order_acquire` counter load, skipping the snapshot load entirely (wait-free on every toolchain) +- `subscribe()` / `unsubscribe()` are copy-on-write under a small writer mutex - Subscriptions auto-unsubscribe on destruction - Handlers invoked in subscription order (preserved across unsubscribe) - Thread-local reentrancy guard detects and rejects subscribe/unsubscribe calls from within a handler, keeping the no-mutation-during-emit invariant intact diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/README.md b/docs/analysis/event_dispatcher_bench_v3.1.0/README.md index 26c4a5b..b2efd1e 100644 --- a/docs/analysis/event_dispatcher_bench_v3.1.0/README.md +++ b/docs/analysis/event_dispatcher_bench_v3.1.0/README.md @@ -30,9 +30,7 @@ Verdict key: - **marginal**: delta is larger than spread but smaller than 1.5x spread. Run-to-run coefficient of variation was 1% to 5% per scenario. Full per-run -TSVs live in [runs/](runs/) (5 OLD + 5 NEW). A representative single run per -side is preserved in [before.tsv](before.tsv) and [after.tsv](after.tsv) for -quick reference. +TSVs live in [runs/](runs/) (5 OLD + 5 NEW). ## Interpretation @@ -93,10 +91,11 @@ PATH="/c/msys64/mingw64/bin:$PATH" ./build/mingw-release/tests/DetourModKit_benc ``` For a clean before/after comparison, bench the new implementation first, -copy the header aside, `git checkout HEAD -- include/DetourModKit/event_dispatcher.hpp` -to restore the baseline header, rebuild the `DetourModKit_bench` target, run -again into the baseline TSV, then restore the new header. Repeat N times -per side and compare medians with an explicit noise-floor check. +copy the header aside, `git checkout main -- include/DetourModKit/event_dispatcher.hpp` +(or any pre-change commit) to restore the baseline header, rebuild the +`DetourModKit_bench` target, run again into the baseline TSV, then restore +the new header. Repeat N times per side and compare medians with an explicit +noise-floor check. ## Caveat on committed TSVs diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/after.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/after.tsv deleted file mode 100644 index 995d1d2..0000000 --- a/docs/analysis/event_dispatcher_bench_v3.1.0/after.tsv +++ /dev/null @@ -1,13 +0,0 @@ -scenario subscribers iterations median_ns_per_op total_ms -emit 0 10000000 6.47 718 -emit 1 5000000 106.85 5860 -emit 8 1000000 249.52 2756 -emit 64 200000 1324.66 2940 -emit_safe 0 10000000 6.32 696 -emit_safe 1 5000000 106.76 5727 -emit_safe 8 1000000 208.92 2341 -emit_safe 64 200000 1077.59 2401 -subscribe_unsub_roundtrip 0 100000 1125.23 1259 -emit_concurrent_4_threads 8 4000000 268.07 1072 -reentrancy_rejection 1 500000 202.82 1148 -# sink=23940842247 diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/before.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/before.tsv deleted file mode 100644 index 59b6007..0000000 --- a/docs/analysis/event_dispatcher_bench_v3.1.0/before.tsv +++ /dev/null @@ -1,13 +0,0 @@ -scenario subscribers iterations median_ns_per_op total_ms -emit 0 10000000 105.20 11778 -emit 1 5000000 126.23 6977 -emit 8 1000000 253.99 3539 -emit 64 200000 1121.43 2538 -emit_safe 0 10000000 103.55 11537 -emit_safe 1 5000000 119.27 6594 -emit_safe 8 1000000 231.13 2555 -emit_safe 64 200000 1169.86 2613 -subscribe_unsub_roundtrip 0 100000 487.18 545 -emit_concurrent_4_threads 8 4000000 551.73 2206 -reentrancy_rejection 1 500000 239.07 1326 -# sink=24038801584 diff --git a/include/DetourModKit/event_dispatcher.hpp b/include/DetourModKit/event_dispatcher.hpp index 14ea33b..301c272 100644 --- a/include/DetourModKit/event_dispatcher.hpp +++ b/include/DetourModKit/event_dispatcher.hpp @@ -10,20 +10,30 @@ * automatically unsubscribe on destruction. * * **Threading model:** - * - `emit()` / `emit_safe()` are lock-free on the hot path: they - * acquire-load a `std::shared_ptr>` - * snapshot and iterate it. Concurrent emits never contend. + * - `emit()` / `emit_safe()` avoid any `shared_mutex` / reader lock. + * The zero-subscriber fast path is wait-free: a single + * `memory_order_acquire` load of an atomic counter. When + * subscribers exist, an atomic acquire-load of a + * `std::shared_ptr>` snapshot is + * performed, then the contiguous handler vector is iterated. + * The snapshot load is genuinely lock-free on toolchains that + * provide a DWCAS-backed `std::atomic>` + * (for example libstdc++ on x86_64), and may use an + * implementation-internal short-critical-section bit lock on + * toolchains that do not (notably MSVC's STL). * - `subscribe()` / manual `unsubscribe()` serialize writers through * a small `std::mutex` and publish a new immutable snapshot via - * copy-on-write. + * copy-on-write. Mutation paths allocate; see the `subscribe()`, + * `unsubscribe()`, and `clear()` method docs for the OOM contract. * - Safe to emit from multiple threads concurrently (e.g., hook callbacks). * - Safe to subscribe/unsubscribe from any thread. * * **Performance characteristics:** * - `emit()`: atomic acquire-load of a `shared_ptr` snapshot, then - * linear iteration over the contiguous handler vector. No mutex - * acquisition on the hot path. When there are no subscribers, - * `emit()` skips the snapshot load entirely via an atomic counter. + * linear iteration over the contiguous handler vector. No + * user-visible mutex acquisition on the hot path. When there are + * no subscribers, `emit()` skips the snapshot load entirely via + * the atomic counter (wait-free fast path). * - `subscribe()` / `unsubscribe()`: copy-on-write. Each writer * allocates a new handler vector (O(n) in the current subscriber * count), appends or removes an entry, and publishes it atomically. @@ -169,11 +179,16 @@ namespace DetourModKit * @endcode * * **Thread safety:** - * - `emit()` / `emit_safe()`: lock-free. Acquires a `shared_ptr` snapshot - * of the immutable handler list and iterates it. + * - `emit()` / `emit_safe()`: the zero-subscriber fast path is wait-free + * (single atomic counter load). Otherwise acquires a `shared_ptr` + * snapshot of the immutable handler list and iterates it. The snapshot + * load avoids any reader lock; it is lock-free on toolchains with a + * DWCAS-backed `std::atomic>` and may use an + * implementation-internal bit lock on toolchains that do not. * - `subscribe()` / `unsubscribe()`: copy-on-write under a small writer * mutex. Each mutation allocates a new handler vector, appends or - * removes the entry, and publishes the new snapshot atomically. + * removes the entry, and publishes the new snapshot atomically. See + * the method docs for the OOM contract. * - Handlers are invoked while the snapshot's `shared_ptr` keeps the * vector alive. A thread-local reentrancy guard detects and rejects * subscribe/unsubscribe calls from within a handler; the guard is what @@ -344,16 +359,31 @@ namespace DetourModKit * @brief Removes all subscribers. * @note Serializes with other writers via the writer mutex; readers * in flight keep their snapshot alive through their shared_ptr. + * Allocates a fresh empty snapshot. On allocation failure the + * dispatcher state is left unchanged (best-effort no-op) so the + * noexcept contract is never violated by a throwing allocator. */ void clear() noexcept { std::scoped_lock lock{this->writer_mutex_}; + // Build the replacement snapshot before touching any published + // state so a throwing allocator leaves handlers_ / handler_count_ + // in their prior consistent pair. Swallowing bad_alloc keeps + // clear() a noexcept best-effort teardown. + std::shared_ptr empty_snap; + try + { + empty_snap = std::make_shared(); + } + catch (...) + { + return; + } // Counter must go to 0 before publishing the empty snapshot so // an emit that reads 0 on the fast-path counter cannot still see // the non-empty old snapshot afterwards. this->handler_count_.store(0, std::memory_order_release); - this->handlers_.store(std::make_shared(), - std::memory_order_release); + this->handlers_.store(std::move(empty_snap), std::memory_order_release); } #if defined(DMK_EVENT_DISPATCHER_INTERNAL_TESTING) @@ -379,11 +409,18 @@ namespace DetourModKit #endif private: - // Returns false when called from within a handler (reentrancy). - // The Subscription::reset() caller retains its unsubscribe_ lambda - // and will retry on the next reset() call (including the destructor) - // once the emit completes. This is safe because the alive_ weak_ptr - // prevents calling into a destroyed dispatcher. + // Returns false when called from within a handler (reentrancy) or + // when the replacement snapshot could not be allocated. The + // Subscription::reset() caller retains its unsubscribe_ lambda on + // false returns and will retry on the next reset() call (including + // the destructor). This is safe because the alive_ weak_ptr prevents + // calling into a destroyed dispatcher, and on allocation failure the + // published state is left untouched so the retry observes the same + // entry still present. + // + // Allocates (std::make_shared + vector growth). On OOM, leaves the + // dispatcher state unchanged and returns false so the RAII retry path + // handles it naturally. bool unsubscribe(SubscriptionId id) noexcept { if (emitting_depth() > 0) @@ -401,15 +438,28 @@ namespace DetourModKit return true; } - auto next = std::make_shared(); - next->reserve(current->size() - 1); - for (const auto &entry : *current) + // Build the replacement snapshot in full before touching any + // published state. A throwing allocator (reserve / push_back / + // make_shared) must not leave handlers_ and handler_count_ out + // of sync, and noexcept forbids propagation, so we catch + // bad_alloc and fall through to the false-return retry path. + std::shared_ptr next; + try { - if (entry.id != id) + next = std::make_shared(); + next->reserve(current->size() - 1); + for (const auto &entry : *current) { - next->push_back(entry); + if (entry.id != id) + { + next->push_back(entry); + } } } + catch (...) + { + return false; + } // Publish snapshot first, then the counter. An emit that loads a // stale snapshot containing the removed handler is still safe diff --git a/src/hook_manager.cpp b/src/hook_manager.cpp index 8bda710..c60137f 100644 --- a/src/hook_manager.cpp +++ b/src/hook_manager.cpp @@ -175,6 +175,15 @@ HookManager::~HookManager() noexcept // valid memory. using HookMap = std::unordered_map, detail::TransparentStringHash, std::equal_to<>>; using VmtHookMap = std::unordered_map>; + // If either map's move constructor could throw, the nothrow new + // expression above would leak its allocation and (since this + // destructor is noexcept) the throw would escalate to std::terminate. + // Pin the contract at compile time so any future change to the hash, + // key_equal, or mapped_type that breaks nothrow-move is caught here. + static_assert(std::is_nothrow_move_constructible_v, + "HookMap move ctor must be noexcept to keep the loader-lock leak path safe."); + static_assert(std::is_nothrow_move_constructible_v, + "VmtHookMap move ctor must be noexcept to keep the loader-lock leak path safe."); [[maybe_unused]] auto *leaked_hooks = new (std::nothrow) HookMap(std::move(m_hooks)); [[maybe_unused]] auto *leaked_vmt_hooks = new (std::nothrow) VmtHookMap(std::move(m_vmt_hooks)); m_shutdown_called.store(true, std::memory_order_release); diff --git a/tests/bench_event_dispatcher.cpp b/tests/bench_event_dispatcher.cpp index 6055f5e..556c31e 100644 --- a/tests/bench_event_dispatcher.cpp +++ b/tests/bench_event_dispatcher.cpp @@ -38,11 +38,16 @@ namespace // Compiler barrier / value sink. Prevents the optimizer from noticing // that the handler result is unused and deleting the whole emit loop. - volatile std::uint64_t g_sink = 0; + // Atomic so bench_concurrent_emit can fan out across threads without a + // data race on the sink; relaxed order because the numeric value is + // never read for synchronization, only printed after join() synchronizes + // with the producers. + std::atomic g_sink{0}; void noop_handler(const BenchEvent &e) noexcept { - g_sink = g_sink + static_cast(e.value); + g_sink.fetch_add(static_cast(e.value), + std::memory_order_relaxed); } // Runs `op` `iterations` times within a single sample, repeats the @@ -69,7 +74,15 @@ namespace } std::sort(per_op.begin(), per_op.end()); - return per_op[per_op.size() / 2]; + // Average the two middle samples on even counts so the helper + // returns the true median regardless of sample parity. Current + // callers use 11 samples (odd), but future callers may not. + const std::size_t n = per_op.size(); + if ((n % 2) == 0) + { + return (per_op[(n / 2) - 1] + per_op[n / 2]) / 2.0; + } + return per_op[n / 2]; } void bench_emit(std::size_t subscriber_count, @@ -246,6 +259,8 @@ int main() bench_reentrancy_rejection(500'000, samples); // Touch g_sink so the optimizer keeps the handler body. - std::printf("# sink=%llu\n", static_cast(g_sink)); + std::printf("# sink=%llu\n", + static_cast( + g_sink.load(std::memory_order_relaxed))); return 0; } diff --git a/tests/test_hook_manager.cpp b/tests/test_hook_manager.cpp index 3abc974..ae4e208 100644 --- a/tests/test_hook_manager.cpp +++ b/tests/test_hook_manager.cpp @@ -2453,17 +2453,28 @@ TEST_F(HookManagerTest, LateShutdown_DrainsReadersBeforeClearingMaps) // Spawn the shutdown racer. shutdown() must block on m_mutator_gate // until the reader releases the shared_lock, which proves the // destructor-equivalent ordering. + std::atomic killer_started{false}; std::atomic shutdown_returned{false}; std::thread killer([&]() { + // Publish entry into hm.shutdown() before the call so the main + // thread can wait on this flag instead of an unconditional sleep. + // Without this, shutdown_returned == false could mean "blocked on + // the mutator gate as intended" or "killer not scheduled yet", + // letting a premature-return regression slip through. + killer_started.store(true, std::memory_order_release); hm.shutdown(); shutdown_returned.store(true, std::memory_order_release); }); - // Give the killer a chance to start; it must not return while the - // reader holds shared_lock (shutdown path acquires exclusive on - // m_hooks_mutex after the shared disable pass). Assert that - // ordering directly so a premature-return regression fails the - // test even if reader_observed_valid still happens to hold. + // Wait until the killer has actually entered hm.shutdown() before we + // assert it is blocked. After that, a short sleep lets the + // m_mutator_gate acquisition attempt settle so we can observe the + // blocked state. The final assertion proves shutdown is held off + // while the reader still holds the shared_lock. + while (!killer_started.load(std::memory_order_acquire)) + { + std::this_thread::yield(); + } std::this_thread::sleep_for(std::chrono::milliseconds(20)); EXPECT_FALSE(shutdown_returned.load(std::memory_order_acquire)); reader_may_return.store(true, std::memory_order_release);