tkhquang · tkhquang · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/.github/workflows/coverage-pages.yml b/.github/workflows/coverage-pages.yml
@@ -27,14 +27,10 @@ env:
   MINGW_BIN: C:\mingw64\bin
 
 jobs:
-  build-and-deploy:
-    name: Build, Test & Deploy Coverage
+  mingw-coverage:
+    name: MinGW Build, Test & Coverage Artifact
     runs-on: windows-latest
 
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -89,6 +85,7 @@ jobs:
             --exclude "build/" \
             --exclude "tests/" \
             --gcov-ignore-parse-errors=negative_hits.warn_once_per_file \
+            --fail-under-line 80 \
             --print-summary \
             --html-details coverage-report/index.html \
             build/mingw-debug
@@ -102,6 +99,43 @@ jobs:
         with:
           path: coverage-report
 
+  msvc-verify:  # MSVC configure/build/test job; listed in the deploy-pages job's `needs`
+    name: MSVC Build & Test
+    runs-on: windows-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          submodules: "recursive"  # check out nested submodules as well
+
+      - name: Set up MSVC developer environment
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64  # NOTE(review): presumably selects the 64-bit toolchain -- confirm against the action docs
+
+      - name: Configure (MSVC Debug + Tests)
+        run: cmake --preset msvc-debug
+        shell: cmd  # NOTE(review): presumably cmd so the MSVC environment set up above is visible -- confirm
+
+      - name: Build
+        run: cmake --build --preset msvc-debug --parallel
+        shell: cmd
+
+      - name: Run Tests
+        run: ctest --preset msvc-debug
+        shell: cmd
+
+  deploy-pages:
+    name: Deploy Coverage Report
+    runs-on: ubuntu-latest
+    needs: [mingw-coverage, msvc-verify]
+
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+
+    steps:
       - name: Deploy to GitHub Pages
         id: deployment
         uses: actions/deploy-pages@v4
diff --git a/AGENTS.md b/AGENTS.md
@@ -167,7 +167,7 @@ pending_messages_.fetch_add(1, std::memory_order_acq_rel);
 
 ### Resource management and patterns
 
-- **RAII everywhere:** `std::unique_ptr`, `std::shared_ptr`, `std::lock_guard`, `std::scoped_lock`. No naked `new`/`delete` in application code. The only permitted exception is leak-on-purpose singletons to avoid the static destruction order fiasco (must be documented with a comment explaining why).
+- **RAII everywhere:** `std::unique_ptr`, `std::shared_ptr`, `std::lock_guard`, `std::scoped_lock`. No naked `new`/`delete` in application code. The only permitted exception is leak-on-purpose state to avoid teardown hazards -- specifically the static destruction order fiasco or deadlock when destruction would run under the Windows loader lock. Any such leak must be documented with a comment explaining why, must use `new (std::nothrow)` so the enclosing `noexcept` path stays honest, and must pin the current module so code pages referenced by the leaked state stay mapped (see `HookManager::~HookManager` and `Logger::shutdown_internal`).
 - **Rule of Zero/Five:** Prefer Rule of Zero (let compiler generate special members). When custom resource management is needed, implement all five special members. Delete copy/move when the type is non-copyable/non-movable.
 - **Atomic memory orderings:** Use the weakest correct ordering. `memory_order_relaxed` for counters and non-critical flags. `acquire`/`release` pairs for synchronization. Document why in comments only when the ordering is non-obvious.
 - **Lock ordering:** When acquiring multiple locks, document the order in the class header and follow it strictly. Example from `logger.hpp`: `1. async_mutex_` then `2. *log_mutex_ptr_`.
@@ -253,14 +253,14 @@ PATH="/c/msys64/mingw64/bin:$PATH" ./build/mingw-debug/tests/DetourModKit_tests.
 | Module | Thread safety | Hot-path mechanism |
 |--------|--------------|-------------------|
 | Scanner | Stateless -- inherently safe | N/A (startup only) |
-| HookManager | `shared_mutex` (readers) / `unique_lock` (writers); two-phase shutdown (disable under shared lock, clear under exclusive lock); `m_mutator_gate` (shared_mutex) blocks new mutators (including all VMT operations) during teardown; CAS on `m_shutdown_called` serializes shutdown/remove_all_hooks; double-checked fast-fail on `m_shutdown_called` in all mutators; destructor fallback (when `DMK_Shutdown()` was not called) acquires `m_mutator_gate` exclusively, flips `m_shutdown_called`, drains readers via exclusive `m_hooks_mutex`, then clears the maps -- under loader lock it pins the module and leaks the maps into a static vector instead of draining (mirrors the Logger::shutdown_internal pattern) | `shared_lock` for `with_inline_hook()` |
+| HookManager | `shared_mutex` (readers) / `unique_lock` (writers); two-phase shutdown (disable under shared lock, clear under exclusive lock); `m_mutator_gate` (shared_mutex) blocks new mutators (including all VMT operations) during teardown; CAS on `m_shutdown_called` serializes shutdown/remove_all_hooks; double-checked fast-fail on `m_shutdown_called` in all mutators; destructor fallback (when `DMK_Shutdown()` was not called) acquires `m_mutator_gate` exclusively, flips `m_shutdown_called`, drains readers via exclusive `m_hooks_mutex`, then clears the maps -- under loader lock it pins the module and move-constructs each map onto the heap via `new (std::nothrow)` so the storage outlives the destructor without ever draining, mirroring the leak-on-loader-lock discipline used in `Logger::shutdown_internal` | `shared_lock` for `with_inline_hook()` |
 | Logger | `atomic<shared_ptr>` for lock-free async reads; `shutdown_internal` is safe across repeated shutdown / enable_async_mode cycles: when the writer thread has to be detached under loader lock, the `shared_ptr<AsyncLogger>` is appended to a static `std::vector` rather than overwriting a single static slot, so prior handles are never dropped while their writer threads may still be running | Single atomic load on log level check |
 | AsyncLogger | Lock-free MPMC queue (Vyukov-style); post-join drain on shutdown (at most one message per producer can be lost in the nanosecond race between drain and force-zero -- accepted trade-off to avoid atomic overhead on every enqueue); timestamp caching in write batches | Atomic sequence numbers per slot |
 | InputPoller | Atomic `active_states_[]` array | `memory_order_relaxed` load per binding |
 | InputManager | `mutex` for lifecycle, `atomic<InputPoller*>` for reads | Lock-free `is_binding_active()` |
 | Memory cache | Sharded `SRWLOCK` + epoch-based shutdown | Shared reader locks per shard |
 | Config | `mutex` for registration; deferred setter invocation outside lock (no reentrancy guard needed -- setters may call back into Config) | N/A (startup only) |
-| EventDispatcher | `shared_mutex` -- shared lock for `emit()`/`emit_safe()`, exclusive lock for subscribe/unsubscribe; thread-local reentrancy guard rejects subscribe/unsubscribe from within handlers; `emit()` propagates handler exceptions, `emit_safe()` catches and skips them | `shared_lock` + contiguous vector iteration in subscription order |
+| EventDispatcher | Lock-free `emit()` / `emit_safe()` via `std::atomic<std::shared_ptr<const std::vector<Entry>>>` snapshot (copy-on-write publish, acquire-load on read); zero-subscriber fast path skips the snapshot load via an atomic handler counter; writers serialize on a small `std::mutex` that never touches the emit hot path; thread-local reentrancy guard rejects subscribe/unsubscribe from within handlers so the no-mutation-during-emit invariant holds; `emit()` propagates handler exceptions, `emit_safe()` catches and skips them | Atomic acquire-load of a `shared_ptr` snapshot plus linear iteration over a contiguous vector; no reader lock |
 | Profiler | Lock-free ring buffer via atomic `fetch_add` on write position; odd/even sequence counter per sample slot prevents torn reads during concurrent export -- the sequence is opened and closed with unconditional `fetch_add` (never a load-then-store) so concurrent producers racing on the same slot cannot roll the counter backwards; `DMK_PROFILE_SCOPE(name)` requires `name` to be a string literal, enforced at compile time by a `ScopedProfile` constructor that only binds to `const char (&)[N]` | Single atomic increment + sequence-guarded field writes per sample |
 
 ### Performance-critical paths

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -467,8 +467,19 @@ endif()
 # --- Unit Tests ---
 option(DMK_BUILD_TESTS "Build unit tests" OFF)
 
-if(DMK_BUILD_TESTS)
-  message(STATUS "Building unit tests...")
-  enable_testing()
+# --- Benchmarks ---
+# Opt-in microbenchmark executables (e.g. DetourModKit_bench). Gated so that
+# normal consumer builds do not produce extra targets. The bench sources live
+# alongside tests/ and are wired up in tests/CMakeLists.txt.
+option(DMK_BUILD_BENCHMARKS "Build benchmark executables" OFF)
+
+if(DMK_BUILD_TESTS OR DMK_BUILD_BENCHMARKS)  # either option adds the tests/ subdirectory
+  if(DMK_BUILD_TESTS)
+    message(STATUS "Building unit tests...")
+    enable_testing()  # called only when unit tests are requested, not for benchmark-only builds
+  endif()
+  if(DMK_BUILD_BENCHMARKS)
+    message(STATUS "Building benchmarks...")  # status message only; no other command in this branch
+  endif()
   add_subdirectory(tests)
 endif()
diff --git a/README.md b/README.md
@@ -105,13 +105,16 @@ DetourModKit is a lightweight C++ toolkit designed to simplify common tasks in g
 
 - Typed pub/sub event system with RAII subscription management
 - Each `EventDispatcher<Event>` manages a single event type
-- `shared_mutex` concurrency: concurrent `emit()` via shared lock, exclusive lock for subscribe/unsubscribe
+- Reader-side lock-free fast path: `emit()` / `emit_safe()` acquire-load a `std::shared_ptr<const vector>` snapshot and iterate with no reader lock; the snapshot load is genuinely lock-free on toolchains with a DWCAS-backed `std::atomic<std::shared_ptr<T>>` and may use an implementation-internal bit lock on toolchains that do not (for example MSVC's STL)
+- Zero-subscriber fast path: `emit()` / `emit_safe()` short-circuit on a single `memory_order_acquire` counter load, skipping the snapshot load entirely (wait-free on every toolchain)
+- `subscribe()` / `unsubscribe()` are copy-on-write under a small writer mutex
 - Subscriptions auto-unsubscribe on destruction
 - Handlers invoked in subscription order (preserved across unsubscribe)
-- Thread-local reentrancy guard detects and rejects subscribe/unsubscribe calls from within a handler, preventing deadlock
+- Thread-local reentrancy guard detects and rejects subscribe/unsubscribe calls from within a handler, keeping the no-mutation-during-emit invariant intact
 - Compose multiple dispatchers for multi-event architectures
 - `emit_safe()` for exception-tolerant dispatch (recommended for hook callbacks)
 - Safe when the dispatcher is destroyed before its subscriptions (weak_ptr guard)
+- Trade-off: `subscribe()` / `unsubscribe()` allocate a new handler list each call (O(n) publish). Suited for 1-10 subscribers per event and write-rarely access patterns, which matches typical mod usage
 
 </details>
 

diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/README.md b/docs/analysis/event_dispatcher_bench_v3.1.0/README.md
@@ -0,0 +1,107 @@
+# EventDispatcher Bench, v3.1.0
+
+Before/after numbers for the lock-free COW snapshot `emit()` that landed in v3.1.0.
+The previous implementation used `std::shared_mutex` for `emit()` / `emit_safe()`
+and an exclusive lock for `subscribe()` / `unsubscribe()`. The new implementation
+stores handlers in a `std::atomic<std::shared_ptr<const std::vector<Entry>>>`
+snapshot published on mutation, with a lock-free atomic handler-count fast
+path for the zero-subscriber case.
+
+## Results (median of 5 runs per side)
+
+| Scenario                    | Subs | Before (ns/op) | After (ns/op) | Delta             | Verdict |
+| --------------------------- | ---: | -------------: | ------------: | ----------------- | :------ |
+| `emit`                      |    0 |          103.9 |       **6.0** | **-94.2% (17x)**  | REAL    |
+| `emit`                      |    1 |          120.1 |          94.4 | **-21.4%**        | REAL    |
+| `emit`                      |    8 |          245.6 |         216.3 | **-11.9%**        | REAL    |
+| `emit`                      |   64 |         1103.5 |        1092.1 | -1.0%             | NOISE   |
+| `emit_safe`                 |    0 |          103.1 |       **5.7** | **-94.5% (18x)**  | REAL    |
+| `emit_safe`                 |    1 |          118.6 |          96.4 | **-18.7%**        | REAL    |
+| `emit_safe`                 |    8 |          233.2 |         219.1 | -6.0%             | REAL    |
+| `emit_safe`                 |   64 |         1086.3 |        1099.8 | +1.2%             | NOISE   |
+| `emit_concurrent_4_threads` |    8 |          517.9 |     **248.2** | **-52.1% (2.1x)** | REAL    |
+| `subscribe_unsub_roundtrip` |    — |          446.0 |        1150.4 | +158.0%           | REAL    |
+| `reentrancy_rejection`      |    1 |          212.5 |         192.7 | -9.4%             | marginal|
+
+Verdict key:
+
+- **REAL**: median delta exceeds 1.5x the combined run-to-run spread on both sides.
+- **NOISE**: median delta is smaller than the run-to-run spread; cannot be distinguished from measurement jitter.
+- **marginal**: delta is larger than spread but smaller than 1.5x spread.
+
+Run-to-run coefficient of variation was 1% to 5% per scenario. Full per-run
+TSVs live in [runs/](runs/) (5 OLD + 5 NEW).
+
+## Interpretation
+
+**Zero-subscriber fast path.** The atomic handler-count short-circuit in
+`emit()` / `emit_safe()` collapses a `shared_mutex` acquire/release plus
+iteration setup into a single `memory_order_acquire` load of an 8-byte counter.
+The 17x factor is the cost of an uncontended `shared_mutex` acquire/release
+on Windows SRWLOCK relative to a naked atomic load, and it is the dominant
+result for dispatchers that are wired up at init but rarely subscribed to.
+
+**1 to 8 subscriber uncontended emit.** Consistent wins (6% to 21%) from
+removing the reader lock. The snapshot load is an atomic acquire-load plus
+a `shared_ptr` refcount bump, which is cheaper than touching a mutex's state
+word unconditionally.
+
+**Concurrent emit (4 threads, 8 subs).** 2.1x throughput. No reader lock
+means no cache-line contention on the mutex state, so all four threads make
+progress in parallel instead of serializing on the SRWLOCK read side.
+
+**64 subscriber emit.** Within noise on both `emit` (-1.0%) and `emit_safe`
+(+1.2%). An earlier single-run measurement suggested an 18% regression; that
+was a statistical outlier. Across 5 runs per side the two implementations
+are indistinguishable at this subscriber count: the per-handler iteration
+cost dominates and both paths reach the same `std::vector<Entry>` buffer
+layout through one extra dereference either way.
+
+**Subscribe / unsubscribe round-trip.** 2.6x slower (446 ns to 1150 ns).
+Each mutation allocates a fresh handler vector, appends or removes the
+entry, and publishes via atomic store. This is documented in the header
+and is the accepted tradeoff for lock-free reads. Subscribe is not on a
+hot path in any realistic mod workload.
+
+**Reentrancy rejection.** Marginal improvement: the delta is larger than the
+run-to-run spread but under 1.5x of it. Not a meaningful claim; effectively unchanged.
+
+## Methodology
+
+- Host: Windows 11, MinGW `mingw-release` preset (GCC 13, libstdc++, -O3 LTO).
+- CMake: `cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF`.
+- Build: `DetourModKit_bench` target only. No gtest linkage, no other test deps.
+- Each sample runs N iterations of the scenario inside a single
+  `steady_clock::now()` pair. Reported value is the median per-op cost across
+  11 samples inside one process invocation. Iteration counts are chosen so
+  each sample takes roughly the same wall time.
+- 5 process invocations per side (OLD vs NEW), back-to-back, same machine,
+  same thermal state. The table above reports the median across those 5 runs
+  for each scenario.
+- Verdicts use run-to-run spread (max minus min across the 5 runs) as the
+  noise floor. A claim is "REAL" only when the median delta exceeds 1.5x
+  that noise floor on both sides.
+
+## Reproduce
+
+```bash
+cmake --preset mingw-release -DDMK_BUILD_BENCHMARKS=ON -DDMK_BUILD_TESTS=OFF
+PATH="/c/msys64/mingw64/bin:$PATH" cmake --build build/mingw-release --target DetourModKit_bench --parallel
+PATH="/c/msys64/mingw64/bin:$PATH" ./build/mingw-release/tests/DetourModKit_bench.exe > run.tsv
+```
+
+For a clean before/after comparison, bench the new implementation first,
+copy the header aside, `git checkout main -- include/DetourModKit/event_dispatcher.hpp`
+(or any pre-change commit) to restore the baseline header, rebuild the
+`DetourModKit_bench` target, run again into the baseline TSV, then restore
+the new header. Repeat N times per side and compare medians with an explicit
+noise-floor check.
+
+## Caveat on committed TSVs
+
+The TSVs under [runs/](runs/) are raw artifacts from a specific host and
+compiler version. They are not a stable baseline. Treat them as evidence
+for the claims in this document, not as a regression gate. Future bench
+runs should regenerate their own numbers and compare against the structure
+of the results (17x fast-path win, 2x concurrent win, COW subscribe cost)
+rather than the absolute nanosecond values.
diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_1.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_1.tsv
@@ -0,0 +1,13 @@
+scenario	subscribers	iterations	median_ns_per_op	total_ms
+emit	0	10000000	6.06	668
+emit	1	5000000	104.53	5715
+emit	8	1000000	220.31	2449
+emit	64	200000	1126.05	2521
+emit_safe	0	10000000	5.71	630
+emit_safe	1	5000000	97.17	5387
+emit_safe	8	1000000	219.06	2428
+emit_safe	64	200000	1120.66	2478
+subscribe_unsub_roundtrip	0	100000	1180.98	1313
+emit_concurrent_4_threads	8	4000000	245.09	980
+reentrancy_rejection	1	500000	203.29	1120
+# sink=23939455106
diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_2.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_2.tsv
@@ -0,0 +1,13 @@
+scenario	subscribers	iterations	median_ns_per_op	total_ms
+emit	0	10000000	6.00	663
+emit	1	5000000	96.19	5303
+emit	8	1000000	220.13	2431
+emit	64	200000	1116.99	2462
+emit_safe	0	10000000	5.69	624
+emit_safe	1	5000000	96.83	5311
+emit_safe	8	1000000	220.24	2438
+emit_safe	64	200000	1111.00	2443
+subscribe_unsub_roundtrip	0	100000	1156.80	1275
+emit_concurrent_4_threads	8	4000000	248.19	992
+reentrancy_rejection	1	500000	190.95	1072
+# sink=23940408562
diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_3.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_3.tsv
@@ -0,0 +1,13 @@
+scenario	subscribers	iterations	median_ns_per_op	total_ms
+emit	0	10000000	6.08	675
+emit	1	5000000	94.44	5200
+emit	8	1000000	215.45	2394
+emit	64	200000	1092.06	2412
+emit_safe	0	10000000	5.79	641
+emit_safe	1	5000000	96.42	5376
+emit_safe	8	1000000	216.73	2395
+emit_safe	64	200000	1099.84	2487
+subscribe_unsub_roundtrip	0	100000	1150.42	1277
+emit_concurrent_4_threads	8	4000000	257.35	1029
+reentrancy_rejection	1	500000	192.65	1081
+# sink=23936874150
diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_4.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_4.tsv
@@ -0,0 +1,13 @@
+scenario	subscribers	iterations	median_ns_per_op	total_ms
+emit	0	10000000	5.72	627
+emit	1	5000000	93.81	5154
+emit	8	1000000	216.31	2375
+emit	64	200000	1091.75	2408
+emit_safe	0	10000000	5.57	614
+emit_safe	1	5000000	95.42	5236
+emit_safe	8	1000000	220.80	2418
+emit_safe	64	200000	1095.09	2407
+subscribe_unsub_roundtrip	0	100000	1123.53	1244
+emit_concurrent_4_threads	8	4000000	235.05	940
+reentrancy_rejection	1	500000	188.63	1060
+# sink=23945296277
diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_5.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/new_5.tsv
@@ -0,0 +1,13 @@
+scenario	subscribers	iterations	median_ns_per_op	total_ms
+emit	0	10000000	5.80	642
+emit	1	5000000	92.25	5104
+emit	8	1000000	211.34	2341
+emit	64	200000	1085.13	2377
+emit_safe	0	10000000	5.67	618
+emit_safe	1	5000000	93.57	5143
+emit_safe	8	1000000	218.15	2393
+emit_safe	64	200000	1082.98	2385
+subscribe_unsub_roundtrip	0	100000	1127.63	1247
+emit_concurrent_4_threads	8	4000000	255.91	1023
+reentrancy_rejection	1	500000	193.98	1071
+# sink=23933756560
diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_1.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_1.tsv
@@ -0,0 +1,13 @@
+scenario	subscribers	iterations	median_ns_per_op	total_ms
+emit	0	10000000	106.61	11705
+emit	1	5000000	128.61	7036
+emit	8	1000000	249.45	2768
+emit	64	200000	1139.86	2500
+emit_safe	0	10000000	105.04	11582
+emit_safe	1	5000000	120.53	6652
+emit_safe	8	1000000	241.97	2673
+emit_safe	64	200000	1093.30	2430
+subscribe_unsub_roundtrip	0	100000	469.98	514
+emit_concurrent_4_threads	8	4000000	519.06	2076
+reentrancy_rejection	1	500000	203.05	1151
+# sink=24040673944
diff --git a/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_2.tsv b/docs/analysis/event_dispatcher_bench_v3.1.0/runs/old_2.tsv
@@ -0,0 +1,13 @@
+scenario	subscribers	iterations	median_ns_per_op	total_ms
+emit	0	10000000	106.19	11701
+emit	1	5000000	120.09	6623
+emit	8	1000000	245.57	2706
+emit	64	200000	1109.16	2439
+emit_safe	0	10000000	105.08	11571
+emit_safe	1	5000000	118.56	6567
+emit_safe	8	1000000	233.16	2585
+emit_safe	64	200000	1081.20	2374
+subscribe_unsub_roundtrip	0	100000	444.50	488
+emit_concurrent_4_threads	8	4000000	509.16	2036
+reentrancy_rejection	1	500000	212.53	1178
+# sink=24038786247