diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..14dcdd1
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,300 @@
+name: CI
+
+on:
+  push:
+    branches: [branch_libev, master]
+  pull_request:
+    branches: [branch_libev, master]
+  workflow_dispatch:
+
+permissions:
+  contents: write
+  deployments: write
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run correctness tests
+        run: make test
+
+      - name: Build benchmarks
+        run: make bench
+
+      - name: Run benchmarks
+        run: ./bench_udpspeeder
+
+      - name: Run benchmarks (JSON)
+        run: taskset -c 0 ./bench_udpspeeder --json
+
+      - name: Build production binary
+        run: make all
+
+      - name: Throughput test (io_uring)
+        run: bash bench/throughput.sh ./speederv2 --iterations 3 --duration 5
+
+      - name: Throughput test (recvfrom baseline)
+        run: UDPSPEEDER_NO_URING=1 bash bench/throughput.sh ./speederv2 --iterations 3 --duration 5
+
+      - name: Store benchmark results
+        if: github.ref == 'refs/heads/branch_libev'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: UDPspeeder Benchmarks
+          tool: customSmallerIsBetter
+          output-file-path: bench_results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '115%'
+          comment-on-alert: true
+          fail-on-alert: false
+          benchmark-data-dir-path: dev/bench
+
+  build-static:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - name: x86_64
+            packages: ""
+            toolchain_url: ""
+            bench_target: bench-static
+            test_target: test-static
+            prod_target: all
+            make_args: ""
+            bench_bin: bench_udpspeeder_static
+            test_bin: test_udpspeeder_static
+            prod_bin: speederv2
+            qemu_cmd: ""
+          - name: aarch64
+            packages: g++-aarch64-linux-gnu qemu-user-static
+            toolchain_url: ""
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: "CC=aarch64-linux-gnu-g++"
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: qemu-aarch64-static
+          - name: mips
+            packages: g++-mips-linux-gnu qemu-user-static
+            toolchain_url: ""
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: "CC=mips-linux-gnu-g++"
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: qemu-mips-static
+          - name: powerpc
+            packages: qemu-user-static zstd
+            toolchain_url: "https://downloads.openwrt.org/releases/25.12.0-rc5/targets/mpc85xx/p1010/openwrt-toolchain-25.12.0-rc5-mpc85xx-p1010_gcc-14.3.0_musl.Linux-x86_64.tar.zst"
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: "SPE=1"
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: "qemu-ppc-static -cpu e500v2"
+          - name: riscv64
+            packages: qemu-user-static zstd
+            toolchain_url: "https://downloads.openwrt.org/releases/24.10.0/targets/sifiveu/generic/openwrt-toolchain-24.10.0-sifiveu-generic_gcc-13.3.0_musl.Linux-x86_64.tar.zst"
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: ""
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: qemu-riscv64-static
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install packages
+        if: matrix.packages != ''
+        run: sudo apt-get update && sudo apt-get install -y ${{ matrix.packages }}
+
+      - name: Download OpenWrt toolchain
+        if: matrix.toolchain_url != ''  # skipped for matrix entries that use distro cross-compilers
+        run: |
+          curl -fSL "${{ matrix.toolchain_url }}" -o toolchain.tar.zst
+          mkdir -p /tmp/openwrt-toolchain
+          tar --zstd -xf toolchain.tar.zst -C /tmp/openwrt-toolchain --strip-components=1
+          OPENWRT_GXX=$(find /tmp/openwrt-toolchain -name '*-g++' -path '*/bin/*' | head -1)  # first cross g++ in the unpacked tree
+          echo "OPENWRT_GXX=${OPENWRT_GXX:?no g++ found under /tmp/openwrt-toolchain}" >> "$GITHUB_ENV"  # abort here; an empty value would make the build steps fall back to the default CC
+          echo "STAGING_DIR=/tmp/openwrt-toolchain" >> "$GITHUB_ENV"
+          echo "Found toolchain: ${OPENWRT_GXX}"
+
+      - name: Build bench (${{ matrix.name }})
+        run: |
+          ARGS="${{ matrix.make_args }}"
+          if [ -n "${OPENWRT_GXX:-}" ]; then
+            ARGS="CC=${OPENWRT_GXX} ${ARGS}"
+          fi
+          make ${{ matrix.bench_target }} ${ARGS}
+
+      - name: Build test (${{ matrix.name }})
+        run: |
+          ARGS="${{ matrix.make_args }}"
+          if [ -n "${OPENWRT_GXX:-}" ]; then
+            ARGS="CC=${OPENWRT_GXX} ${ARGS}"
+          fi
+          make ${{ matrix.test_target }} ${ARGS}
+
+      - name: Build production (${{ matrix.name }})
+        run: |
+          ARGS="${{ matrix.make_args }}"
+          if [ -n "${OPENWRT_GXX:-}" ]; then
+            ARGS="CC=${OPENWRT_GXX} ${ARGS}"
+          fi
+          make ${{ matrix.prod_target }} ${ARGS}
+
+      - name: Verify binaries
+        run: file -E ${{ matrix.bench_bin }} ${{ matrix.test_bin }} ${{ matrix.prod_bin }}  # -E: exit non-zero if a binary is missing (plain `file` reports the error but still exits 0)
+
+      - name: Run tests (QEMU)
+        if: matrix.qemu_cmd != ''
+        run: ${{ matrix.qemu_cmd }} ./${{ matrix.test_bin }}
+
+      - name: Run benchmarks (QEMU)
+        if: matrix.qemu_cmd != ''
+        run: ${{ matrix.qemu_cmd }} ./${{ matrix.bench_bin }}
+
+      - name: Run benchmarks JSON (QEMU)
+        if: matrix.qemu_cmd != '' && matrix.name == 'powerpc'
+        run: ${{ matrix.qemu_cmd }} ./${{ matrix.bench_bin }} --json
+
+      - name: Store PPC benchmark results
+        if: matrix.name == 'powerpc' && github.ref == 'refs/heads/branch_libev'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: UDPspeeder Benchmarks (PowerPC e500v2 via QEMU)
+          tool: customSmallerIsBetter
+          output-file-path: bench_results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '200%'
+          comment-on-alert: true
+          fail-on-alert: false
+          benchmark-data-dir-path: dev/bench-powerpc
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: udpspeeder-${{ matrix.name }}
+          path: |
+            ${{ matrix.bench_bin }}
+            ${{ matrix.test_bin }}
+            ${{ matrix.prod_bin }}
+            bench/profile.sh
+
+  interop:
+    runs-on: ubuntu-latest
+    needs: [build-static]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install QEMU
+        run: sudo apt-get update && sudo apt-get install -y qemu-user-static
+
+      - name: Download x86_64 artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-x86_64
+          path: bin/x86_64
+
+      - name: Download aarch64 artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-aarch64
+          path: bin/aarch64
+
+      - name: Download mips artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-mips
+          path: bin/mips
+
+      - name: Download powerpc artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-powerpc
+          path: bin/powerpc
+
+      - name: Download riscv64 artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-riscv64
+          path: bin/riscv64
+
+      - name: Set executable permissions
+        run: chmod +x bin/*/speederv2 bin/*/speederv2_cross  # artifact upload/download does not preserve the executable bit
+
+      - name: Run cross-architecture interop tests
+        run: |
+          set -e
+          PASS=0
+          FAIL=0
+
+          X86="bin/x86_64/speederv2"
+          ARM="qemu-aarch64-static bin/aarch64/speederv2_cross"
+          MIPS="qemu-mips-static bin/mips/speederv2_cross"
+          PPC="qemu-ppc-static -cpu e500v2 bin/powerpc/speederv2_cross"
+          RV64="qemu-riscv64-static bin/riscv64/speederv2_cross"
+
+          TESTS=(
+            "x86-server_arm-client|$X86|$ARM"
+            "arm-server_x86-client|$ARM|$X86"
+            "x86-server_mips-client|$X86|$MIPS"
+            "mips-server_x86-client|$MIPS|$X86"
+            "x86-server_ppc-client|$X86|$PPC"
+            "ppc-server_x86-client|$PPC|$X86"
+            "x86-server_rv64-client|$X86|$RV64"
+            "rv64-server_x86-client|$RV64|$X86"
+          )
+
+          CONFIGS=(
+            "--disable-fec|no-fec"
+            "--disable-fec --key testkey123|no-fec-key"
+            "--fec 20:10|fec-20-10"
+            "--fec 20:10 --key testkey123|fec-20-10-key"
+          )
+
+          for entry in "${TESTS[@]}"; do
+            IFS='|' read -r pair_name server_cmd client_cmd <<< "$entry"
+            for cfg in "${CONFIGS[@]}"; do
+              IFS='|' read -r cfg_args cfg_label <<< "$cfg"
+              label="${pair_name}/${cfg_label}"
+
+              echo ""
+              echo "=========================================="
+              echo "  TESTING: $label"
+              echo "=========================================="
+
+              if bash bench/interop.sh \
+                --server-cmd "$server_cmd" \
+                --client-cmd "$client_cmd" \
+                $cfg_args \
+                --label "$label" \
+                --packets 200; then
+                PASS=$((PASS + 1))
+              else
+                FAIL=$((FAIL + 1))
+                echo "^^^ FAILED: $label ^^^"
+              fi
+            done
+          done
+
+          echo ""
+          echo "=========================================="
+          echo "  INTEROP RESULTS: $PASS passed, $FAIL failed"
+          echo "=========================================="
+
+          if [[ $FAIL -ne 0 ]]; then
+            exit 1
+          fi
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..194d7d6
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,73 @@
+name: Release
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+permissions:
+  contents: write
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run tests
+        run: make test
+
+  build:
+    needs: test
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - name: x86_64
+            packages: ""
+            target: all
+            make_args: ""
+            src_bin: speederv2
+            release_bin: speederv2_linux_x86_64
+          - name: aarch64
+            packages: g++-aarch64-linux-gnu
+            target: all-cross
+            make_args: "CC=aarch64-linux-gnu-g++"
+            src_bin: speederv2_cross
+            release_bin: speederv2_linux_aarch64
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install toolchain
+        if: matrix.packages != ''
+        run: sudo apt-get update && sudo apt-get install -y ${{ matrix.packages }}
+
+      - name: Build production binary
+        run: make ${{ matrix.target }} ${{ matrix.make_args }}
+
+      - name: Verify binary
+        run: file ${{ matrix.src_bin }}
+
+      - name: Rename binary
+        run: mv ${{ matrix.src_bin }} ${{ matrix.release_bin }}
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.release_bin }}
+          path: ${{ matrix.release_bin }}
+
+  release:
+    needs: build  # waits for the build matrix job before publishing
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts  # no `name` given: every artifact lands in its own artifacts/<artifact-name>/ directory
+
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: artifacts/**/*  # recursive glob so the binaries inside the per-artifact directories are attached
+          generate_release_notes: true
diff --git a/.github/workflows/throughput.yml b/.github/workflows/throughput.yml
new file mode 100644
index 0000000..3f9a9e9
--- /dev/null
+++ b/.github/workflows/throughput.yml
@@ -0,0 +1,67 @@
+name: Throughput
+
+on:
+  push:
+    branches: [branch_libev, master]
+  pull_request:
+    branches: [branch_libev, master]
+
+permissions:
+  contents: write
+
+jobs:
+  throughput:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout current branch
+        uses: actions/checkout@v4
+
+      - name: Fix script permissions
+        run: chmod +x bench/throughput.sh
+
+      - name: Build current binary
+        run: make all
+
+      - name: Rename current binary
+        run: mv speederv2 speederv2_current
+
+      - name: Checkout baseline
+        uses: actions/checkout@v4
+        with:
+          ref: baseline
+          path: baseline
+
+      - name: Build baseline binary
+        run: make -C baseline all
+
+      - name: Rename baseline binary
+        run: mv baseline/speederv2 speederv2_baseline
+
+      - name: Generate JSON results (current + baseline)
+        run: |
+          cur_nofec=$(./bench/throughput.sh ./speederv2_current --disable-fec --json)
+          cur_fec=$(./bench/throughput.sh ./speederv2_current --fec 20:10 --json)
+          base_nofec=$(./bench/throughput.sh ./speederv2_baseline --disable-fec --json)
+          base_fec=$(./bench/throughput.sh ./speederv2_baseline --fec 20:10 --json)
+
+          # Rename baseline entries to include "baseline/" prefix
+          base_nofec=$(echo "$base_nofec" | sed 's|"throughput/|"baseline/throughput/|')
+          base_fec=$(echo "$base_fec" | sed 's|"throughput/|"baseline/throughput/|')
+
+          printf '[%s, %s, %s, %s]\n' "$cur_nofec" "$cur_fec" "$base_nofec" "$base_fec" > throughput_results.json
+          cat throughput_results.json
+          python3 -c "import json; d=json.load(open('throughput_results.json')); [print(f\"  {e['name']}: {e['value']} {e['unit']}\") for e in d]"
+
+      - name: Store throughput results
+        if: github.ref == 'refs/heads/branch_libev'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: UDPspeeder Throughput
+          tool: customBiggerIsBetter
+          output-file-path: throughput_results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '115%'
+          comment-on-alert: true
+          fail-on-alert: false
+          benchmark-data-dir-path: dev/throughput
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dff2fb6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.claude/
+.gitmodules
diff --git a/OPTIMIZATION.md b/OPTIMIZATION.md
new file mode 100644
index 0000000..0e7a918
--- /dev/null
+++ b/OPTIMIZATION.md
@@ -0,0 +1,433 @@
+# UDPspeeder Optimization Results
+
+Benchmarked on Intel Core i5-7300U (Kaby Lake, 2C/4T, SSE4.2 + AVX2).
+Target platforms: Intel N150 (Alder Lake-N), Mediatek Filogic (ARMv8),
+TP-Link TL-WDR4900 (Freescale P1014 e500v2, PowerPC SPE).
+
+## End-to-end throughput (GitHub Actions, 1400B UDP, loopback)
+
+| Config | Baseline | Current | Improvement |
+|---|---|---|---|
+| no-fec | 618-861 Mbps | 942-1517 Mbps | **+48-76%** |
+| fec 20:10 | 364-509 Mbps | 657-1082 Mbps | **+81-113%** |
+
+Ranges reflect CI host variance between runs. Current/baseline measured on
+the same host within each run.
+
+FEC overhead dropped from ~41% of throughput (baseline) to ~30% (current).
+
+## Microbenchmark summary at 1500B
+
+| Path | Before | After | Speedup |
+|---|---|---|---|
+| addmul1 (GF multiply-accumulate) | 665 ns | 58 ns | 11.5x |
+| rs_encode k=10 n=15 | 34,000 ns | 3,100 ns | 11x |
+| rs_decode k=10 n=15 | 36,000 ns | 4,500 ns | 8x |
+| do_cook (CRC32C + obscure + XOR) | 3,000 ns | 713 ns | 4.2x |
+| cook XOR encryption only | 1,826 ns | 131 ns | 14x |
+
+## Commits
+
+### 1. Benchmark and test harness
+
+Nanobench-based microbenchmarks and correctness tests for FEC, CRC32,
+and packet cooking. Enables data-driven optimization and CI regression
+detection.
+
+### 2. CRC32C (Castagnoli) replacing CRC32 (zlib)
+
+Switched packet checksum from software CRC32 to CRC32C with hardware
+acceleration via SSE4.2 (`_mm_crc32_u64`) and ARMv8-CRC (`__crc32cb`).
+Software fallback for CPUs without hardware support.
+
+### 3. SSSE3 and NEON addmul1 vectorization
+
+The GF(2^8) multiply-accumulate (`addmul1`) is the inner loop of Reed-Solomon
+encode and decode. The scalar implementation uses a 64KB lookup table — one
+byte at a time.
+
+SSSE3/NEON use nibble decomposition: split each input byte into low/high
+nibbles, use `PSHUFB`/`TBL` as a 16-entry parallel lookup, XOR the results.
+Processes 16 bytes per iteration. Scalar tail for remainder.
+
+### 4. AVX2 addmul1 with runtime CPUID dispatch
+
+Same nibble-decomposition approach widened to 256-bit registers. `VPSHUFB`
+operates on two independent 128-bit lanes, so the 16-byte lookup table is
+duplicated into both lanes via `_mm256_broadcastsi128_si256`.
+
+Runtime dispatch: 3-phase CPUID check (OSXSAVE + XCR0 AVX state + leaf 7
+bit 5). Function pointer `addmul1_x86_fn` resolved in `init_fec()`.
+SSE tail handles the 16-31 byte remainder.
+
+### 5. Packet cook refactor into context struct
+
+Extracted the cook pipeline (CRC32C + XOR obfuscation + XOR encryption)
+from `packet.cpp` into self-contained `packet_cook.cpp` with a `cook_ctx_t`
+context struct. Eliminates 6 global variables. No dependency on common.h,
+libev, or networking code — enables benchmarking and testing in isolation.
+
+### 6. SSE2 and NEON XOR vectorization for cook pipeline
+
+The cook XOR loops (key encryption and IV obfuscation) were byte-at-a-time
+with data-dependent branches (`if (key[j] == 0) j = 0`). Compilers cannot
+auto-vectorize these.
+
+Pre-expand the repeating key/IV pattern into a tile whose length is
+`lcm(pattern_len, 16)`, then XOR 16 bytes at a time with SSE2 or NEON.
+Key tile is computed once at startup (`cook_ctx_prepare_key`), IV tile is
+built per-packet on the stack (4-32 bytes, tile at most 496 bytes).
+
+### 7. Eliminate per-call malloc in fec_decode
+
+`fec_decode` performed 7+ malloc/free pairs per call. Replaced with:
+- `invert_mat`: 5 heap buffers replaced with stack VLAs (max 3.6 KB)
+- `build_decode_matrix`: k*k matrix pre-allocated in `fec_parms` struct
+- `fec_decode`: per-row data buffers replaced with contiguous scratch
+  in `fec_parms`, lazily grown on first use
+
+Eliminates allocation jitter on the real-time decode path.
+
+### 8. io_uring multishot receive with provided buffer rings
+
+Replaced per-packet `recvfrom()` / `recv()` syscalls with io_uring
+multishot receive using kernel-managed provided buffer rings. The kernel
+fills pre-registered buffers and posts completions to a shared ring —
+userspace drains batches of completions without syscalls per packet.
+
+Key implementation details:
+- **Multishot recvmsg** for unconnected sockets (server local, client local)
+  with `IORING_OP_RECVMSG` + `IORING_RECV_MULTISHOT` + `IOSQE_BUFFER_SELECT`
+- **Multishot recv** for connected sockets (server remote, client remote)
+- **Provided buffer ring** (`IORING_REGISTER_PBUF_RING`): 256 buffers,
+  power-of-2 ring, kernel picks buffers without userspace involvement
+- **Batched CQ drain**: single `acquire` load on CQ tail, process all ready
+  CQEs, single `release` store to advance CQ head
+- **Batched buffer recycling**: deferred ring entries with single atomic
+  tail commit per batch
+- **Combined submit+flush**: `io_uring_enter(IORING_ENTER_SQ_WAKEUP |
+  IORING_ENTER_GETEVENTS)` — one syscall for SQE submission and CQE
+  materialization
+- **Zero-copy paths**: all four socket paths process directly from provided
+  buffers. recvmsg paths (unconnected) have 140+ bytes natural headroom.
+  recv paths (connected) use `URING_RECV_HEADROOM` (4 bytes) reserved before
+  each buffer for in-place conv header insertion, eliminating per-packet memcpy.
+- **COOP_TASKRUN + SINGLE_ISSUER** flags with fallback for older kernels
+- **CQ ring sized 4x buffer count** to avoid multishot stalls
+- Graceful fallback to `recvfrom()` on older kernels or non-Linux
+
+Bugs fixed during development:
+- `io_uring_recvmsg_out` payload offset must use template `msg_namelen`
+  (128 bytes for `sockaddr_storage`), not `hdr->namelen` (actual, e.g. 16)
+- CQ tail read requires `acquire` barrier (correctness on ARM/NEON targets)
+- Ring fd notification gap after CQ drain: explicit `IORING_ENTER_GETEVENTS`
+  flush needed to materialize deferred completions
+
+GitHub Actions throughput (no-fec, 1400B UDP, loopback):
+
+| Path | Median Mbps | Runs |
+|---|---|---|
+| io_uring multishot | 798.5 | 784.8, 798.5, 837.8 |
+| recvfrom baseline | 629.9 | 627.6, 629.9, 654.0 |
+| **Improvement** | **+27%** | |
+
+### 9. sendmmsg batching for FEC output
+
+Replaced per-packet `sendto()` calls after FEC encoding with a single
+`sendmmsg()` call per batch. When the delay manager detects all output
+delays are zero (the common case), it routes through `my_send_batch()`
+which cooks all packets then issues one `sendmmsg()` for the entire batch.
+
+Typically 20-30 packets per FEC batch → 20-30 syscalls reduced to 1.
+
+### 10. Flat array replacing std::map in FEC decode
+
+Replaced `std::map<int, fec_data_t*>` shard index in `fec_decode_manager_t`
+with a flat pre-allocated array indexed directly by shard position. Eliminates
+per-shard tree traversal (O(log n) → O(1)) and per-node heap allocation.
+
+### 11. Zero-copy io_uring recv for conv header
+
+The conv header (4 bytes) was previously inserted via `memmove` after receive.
+Reserved `URING_RECV_HEADROOM` (4 bytes) before each provided buffer at
+registration time. Recv paths now write the conv header directly into the
+headroom, avoiding any per-packet memcpy/memmove.
+
+### 12. Anti-replay direct-mapped table
+
+Replaced `unordered_map<u32_t, info_t>` + `u64_t[30000]` ring buffer (~2 MB
+scattered) with a `u32_t[32768]` direct-mapped table (128 KB contiguous).
+
+Design: `table[seq & (SIZE-1)] = seq` to mark seen, `table[seq & (SIZE-1)] != seq`
+to check validity. Power-of-2 size for bitwise modulo. Old entries naturally
+evicted when new seqs map to the same slot. Effective replay window ~32K groups,
+comparable to the old 30K ring buffer. No timeout logic needed.
+
+Per-shard cost: single array access + compare vs hash computation + pointer chase.
+
+### 13. Flat group table in FEC decode with bitmap shard tracking
+
+Replaced `unordered_map<u32_t, fec_group_t>` with a pre-allocated direct-mapped
+array sized `next_pow2(fec_buff_num * 2)`. Safe because FEC sequence numbers
+are monotonically increasing — consecutive seqs map to distinct slots, so no
+two concurrent groups collide when table size exceeds max concurrent groups.
+
+Shard tracking uses a 32-byte bitmap (`u32_t[8]`, 256 bits) instead of
+`memset(shard_idx, -1, 1024)`. Only the bitmap is cleared on group creation
+(32 bytes vs 1024 bytes). The `shard_idx[]` array is only accessed through
+`has_shard()`/`set_shard()` which check the bitmap first.
+
+Eliminates ~50K malloc/free pairs per second at line rate (one allocation per
+FEC group for the map node containing the ~1 KB `fec_group_t`).
+
+### 14. Final hot-path micro-optimizations
+
+Replaced per-struct `memset(&msgs[i], 0, sizeof(msgs[i]))` (~64 bytes)
+in the sendmmsg loop with targeted field writes (6 fields, ~48 bytes
+skipped per struct × 20-30 packets per FEC batch). Merged two separate
+loops in type==1 FEC decode (pre-init + populate) into one. Added early
+`pad > 0` check to skip shard padding memset when the shard is already
+at max_len.
+
+Each individually below CI noise floor. Combined: 1-3%.
+
+### 15. PowerPC e500v2 SPE XOR for cook pipeline
+
+Added SPE (Signal Processing Extension) assembly for the XOR cook stage on
+PowerPC e500v2 (Freescale P1014, used in TP-Link TL-WDR4900 running OpenWrt).
+
+The e500v2 has no AltiVec/VMX. SPE provides 64-bit operations: `evldd`/`evstdd`
+(8-byte aligned load/store) and `evxor` (64-bit XOR). GCC 9+ removed SPE
+intrinsics, so this is standalone assembly (`xor_spe.S`) following the Linux
+kernel pattern (`arch/powerpc/crypto/aes-spe-core.S`).
+
+Implementation details:
+- **4x unrolled main loop**: 32 bytes/iteration with `evldd`/`evxor`/`evstdd`
+- **Alignment handling**: scalar head loop until data is 8-byte aligned,
+  8-byte tail loop, then byte tail for remainder
+- **Tile wrap**: offset tracking with compare-and-reset per doubleword
+- **Build guard**: `HAVE_PPC_SPE` define, set via `make SPE=1`
+- **Word-width generic fallback**: non-SPE generic platforms (MIPS, RISC-V,
+  ARMv7) now use `sizeof(unsigned long)` XOR instead of byte-at-a-time
+
+PPC assembly gotchas fixed during development:
+- OpenWrt binutils 2.44 requires `%r` register prefix (`%r0`, `%r3`); bare
+  `r0` is treated as a symbol reference ("unsupported relocation" errors)
+- PPC r0-as-zero: `addi rD, r0, imm` treats r0 as literal 0, not the
+  register. Fixed by using `addic` (no r0 special case, but clobbers XER[CA])
+- `evldd` reads 8 bytes at tile+offset; after unaligned head loop, offset
+  can be 1-7, straddling tile boundary. Fixed with `COOK_VEC_WIDTH` padding
+  bytes at end of tile buffers
+
+SPE only helps the XOR stage of cook. It cannot help `addmul1` (requires
+byte-level shuffle, absent on SPE) or CRC32C (no hardware CRC on e500v2).
+
+PowerPC e500v2 microbenchmarks (QEMU, GitHub Actions):
+
+| Path | Baseline | Current | Speedup |
+|---|---|---|---|
+| crc32c/1500B (sw slicing-by-8) | 2,609 ns | 1,804 ns | **1.4x** |
+| rs_encode k=10 n=15 | 182,166 ns | 123,385 ns | **1.5x** |
+| rs_decode k=10 n=15 | 169,905 ns | 133,963 ns | **1.3x** |
+| addmul1/1500B (scalar) | 2,453 ns | 2,447 ns | 1.0x |
+
+RS encode/decode improvement is from pre-allocated decode buffers (#7), not
+SPE. CRC32C improvement is from switching CRC32-zlib to CRC32C-Castagnoli
+(software slicing-by-8 table, #2). addmul1 is identical (both scalar).
+
+Cook pipeline numbers (current only, no baseline cook tests):
+
+| Path | PPC (QEMU) ns |
+|---|---|
+| do_cook/1500B | 4,090 |
+| de_cook/1500B | 3,971 |
+| cook_xor_only/1500B | 1,091 |
+| cook_obscure_only/1500B | 1,415 |
+| cook_crc32_only/1500B | 1,983 |
+
+Files: `xor_spe.S` (new), `packet_cook.cpp`, `makefile`, `.github/workflows/ci.yml`
+
+## Analysis and diminishing returns
+
+After 15 optimizations, the codebase is within 10% of the theoretical
+floor (see below). Remaining overhead is irreducible:
+
+**Syscall overhead**: Eliminated. io_uring multishot recv batches receives
+without per-packet syscalls. sendmmsg batches sends.
+
+**Compute hotspots**: SIMD-vectorized. GF(2^8) multiply-accumulate uses
+AVX2/SSSE3/NEON. CRC32C uses hardware instructions. XOR cook uses
+SSE2/NEON/SPE (PPC e500v2). Word-width fallback for generic platforms.
+
+**Allocation overhead**: Eliminated from hot paths. FEC decode uses pre-allocated
+buffers, flat arrays, and direct-mapped tables. No `malloc`/`free` per packet
+or per group.
+
+**Memory copies**: Two per-packet memcpy remain and are architecturally necessary:
+1. `blob_encode_t::input()` — packets must be packed contiguously before RS can
+   slice them into equal-length shards. Shard boundaries aren't known until the
+   batch is complete (depends on total data size and optimal data_num selection).
+2. `fec_decode_manager_t::input()` — received shards must be copied into owned
+   buffers because RS decode modifies data in-place.
+
+Each copies ~1400 bytes per packet. At 1.5M packets/sec, that's ~4 GB/sec of
+memcpy bandwidth — real but fundamental to the FEC architecture.
+
+## Theoretical FEC overhead floor
+
+For a given FEC config k:r (k data shards, r redundant, n=k+r total), the
+minimum per-packet overhead has three irreducible components:
+
+### 1. Wire amplification
+
+Every k application packets produce n=k+r packets on the wire. Goodput
+cannot exceed k/n of wire capacity regardless of CPU speed.
+
+| Config | k | r | n | Wire overhead |
+|---|---|---|---|---|
+| fec 20:10 | 20 | 10 | 30 | **33% lost** (goodput ≤ 67% of no-fec) |
+| fec 10:5 | 10 | 5 | 15 | **33% lost** |
+| fec 5:3 | 5 | 3 | 8 | **38% lost** (goodput ≤ 63%) |
+
+This is information-theoretic: you must transmit r/k extra data.
+
+### 2. RS encode compute (per application packet)
+
+Encode generates r parity shards. Each parity shard requires k addmul1
+calls over shard_len bytes (`lib/fec.cpp:940-944`). Per batch of k packets:
+
+    total_addmul1 = r × k    calls at shard_len ≈ 1400 bytes
+    per_app_packet = r        addmul1(shard_len) calls
+
+| Config | addmul1 per pkt | ns/pkt (x86 AVX2) | ns/pkt (PPC scalar) |
+|---|---|---|---|
+| fec 20:10 | 10 | 10 × 38 = **380** | 10 × 2447 = **24,470** |
+| fec 10:5 | 5 | 5 × 38 = **190** | 5 × 2447 = **12,235** |
+| fec 5:3 | 3 | 3 × 38 = **114** | 3 × 2447 = **7,341** |
+
+These are pure addmul1 costs; in addition, each parity shard buffer is
+zeroed (bzero) once before its first addmul1 call.
+
+### 3. Cook amplification
+
+Every shard (data + parity) is cooked before send and de-cooked after
+receive. Per application packet: n/k cook + n/k de_cook calls.
+
+| Config | cook+de_cook/pkt | ns/pkt (x86 AVX2) |
+|---|---|---|
+| fec 20:10 | 1.5 × (351 + 230) = **872** | (no-fec: 351 + 230 = 581) |
+| fec 10:5 | 1.5 × (351 + 230) = **872** | |
+| fec 5:3 | 1.6 × (351 + 230) = **930** | |
+
+### Combined floor (x86_64 AVX2, fec 20:10, no loss)
+
+| Component | Per-app-pkt (ns) | Notes |
+|---|---|---|
+| RS encode | 380 | 10 × addmul1(1400B) |
+| Cook amplification | +291 | 0.5 extra cook+de_cook |
+| memcpy (blob input) | ~35 | 1400B at ~40 GB/s L1 |
+| memcpy (decode input) | ~53 | 1.5 × 1400B |
+| bzero (parity init) | ~18 | 0.5 × 1400B |
+| **Total overhead** | **~777** | on top of no-fec cost |
+
+No-fec per-packet cost: ~581 ns (cook + de_cook).
+FEC 20:10 per-packet cost: ~1358 ns (581 + 777).
+**Minimum FEC throughput ratio: 581 / 1358 = 43% of no-fec** (compute-bound).
+
+But wire amplification caps at 67% of no-fec, which is less restrictive.
+At low throughput (CPU-bound), the compute floor dominates. At high
+throughput (bandwidth-bound), the wire floor dominates.
+
+CI measured ~70% of no-fec (30% overhead), better than the compute floor
+predicts. This is because the throughput test is bandwidth-limited on
+loopback before hitting CPU saturation — the wire amplification floor
+(67%) is the binding constraint, and measured overhead (30%) is close to
+the theoretical 33%.
+
+### Decode worst case
+
+When r data shards are lost, decode reconstructs each via k addmul1 calls
+(`lib/fec.cpp:1060-1065`). Per batch: r × k addmul1 = same as encode.
+Per app packet: r addmul1(shard_len) — identical to encode cost.
+
+Worst-case round-trip (all r lost): encode + decode = 2r addmul1 per
+app packet = 760 ns/pkt on x86 AVX2 for fec 20:10.
+
+### Implication
+
+The current ~30% FEC overhead on CI loopback is within 10% of the
+information-theoretic floor (33% wire amplification). No further software
+optimization can meaningfully close this gap. On real networks with actual
+packet loss, the wire amplification is the cost of redundancy by design.
+
+## Not done (deliberately)
+
+**Scatter-gather RS encoder to eliminate blob_encode memcpy**: The
+`blob_encode_t::input()` memcpy (~1400B per packet) packs variable-length
+application packets into a contiguous buffer with interleaved 2-byte length
+headers, then slices the result into k equal-length shards for RS encode.
+This copy exists because shard boundaries depend on total batch size, which
+isn't known until the last packet arrives. Eliminating it would require the
+RS encoder to accept a scatter-gather (iovec-style) input instead of flat
+`char *data[k]` pointers. That means rewriting `fec_encode`'s inner loop
+(`lib/fec.cpp:940-945`) and `addmul1` to iterate over discontiguous chunks,
+adding branch overhead per chunk boundary inside the tightest SIMD loop in
+the system. The alternative — pre-positioning packets into a shard grid as
+they arrive — fails because the grid layout depends on the final batch size.
+Net effect: replaces a 1400B L1-resident memcpy (~35 ns) with scatter-gather
+bookkeeping of comparable cost, while adding complexity to the FEC core.
+
+**Zero-copy RS decode to eliminate fec_data memcpy**: The
+`fec_decode_manager_t::input()` memcpy (~1400B per shard) copies received
+shards into owned `fec_data[].buf` buffers because `fec_decode` modifies
+data in-place — it overwrites redundancy shard buffers with recovered data
+(`lib/fec.cpp:1061-1067`), then copies results back (`lib/fec.cpp:1072-1075`).
+Pointing RS decode directly at io_uring provided buffers would corrupt the
+kernel buffer ring (buffers must be recycled promptly or ring starvation
+occurs). For the recvfrom path, the receive buffer is stack-local and reused
+per callback. Making `fec_decode` write to separate output buffers instead
+of in-place would eliminate the input copy but add an identical output copy
+(the recovered data must still go somewhere). Net: zero gain, additional
+complexity in the 1997-era Vandermonde matrix math.
+
+**Auto-vectorization of scalar GF(2^8) fallback**: The scalar `addmul1`
+uses a 64KB lookup table indexed by runtime byte values. No compiler can
+auto-vectorize arbitrary table lookups — the SSSE3/NEON `PSHUFB`/`TBL`
+approach requires algebraic insight (nibble decomposition of GF multiplication)
+that is beyond compiler analysis. Documented in `lib/fec.cpp`.
+
+**-O3**: Tested, no measurable improvement. All hot paths are hand-written
+SIMD intrinsics or hardware CRC32C — the compiler's extra optimization
+passes have nothing to improve.
+
+**Alignment audit**: Unaligned SIMD loads/stores used throughout (correct
+for arbitrary buffer pointers). On both SIMD targets (x86-64, AArch64),
+unaligned accesses that don't cross cache line boundaries are free.
+Estimated impact of forced alignment: <1%.
+
+## Cross-architecture notes
+
+**x86_64** (N150, CI runners): Full SIMD coverage. AVX2 addmul1, SSE4.2
+CRC32C, SSE2/AVX2 XOR cook. io_uring multishot recv, sendmmsg batching.
+All optimizations apply.
+
+**ARMv8/AArch64** (Mediatek Filogic): NEON addmul1 (TBL), ARMv8-CRC
+CRC32C, NEON XOR cook. All three compute paths are vectorized. io_uring
+available if kernel 6.0+. Cross-compiled and QEMU-tested in CI; untested
+on real Filogic hardware.
+
+**PowerPC e500v2** (TL-WDR4900): SPE XOR only. addmul1 is scalar
+(SPE has no byte-level shuffle/permute equivalent to PSHUFB/TBL).
+CRC32C is software slicing-by-8 (no hardware CRC). No io_uring
+(older kernel). Expected real-hardware throughput: 50-150 Mbps no-fec,
+15-40 Mbps fec-20:10, limited by scalar addmul1.
+
+**MIPS 24Kc** (AR71xx OpenWrt targets): No useful SIMD. MIPS SIMD
+Architecture (MSA) is only on MIPS32r5+ (P5600, I6400), not 24Kc.
+All paths would be scalar. Build targets exist in makefile but are
+untested with current optimization work.
+
+**RISC-V RV64GCV**: Hypothetical future target. The V extension has
+`vrgather` which can implement GF(2^8) nibble-decomposition lookup
+(equivalent to PSHUFB/TBL), potentially vectorizing addmul1. This
+is the only other ISA besides x86/ARM that could accelerate FEC.
diff --git a/bench/bench_common.h b/bench/bench_common.h
new file mode 100644
index 0000000..5746227
--- /dev/null
+++ b/bench/bench_common.h
@@ -0,0 +1,34 @@
+#ifndef BENCH_COMMON_H
+#define BENCH_COMMON_H
+
+#include <stddef.h>
+
+/* gf type matches lib/fec.cpp for GF_BITS=8 */
+typedef unsigned char gf;
+
+/* Exposed by lib/fec.cpp when compiled with -DBENCH_EXPOSE_INTERNALS */
+extern "C++" void bench_addmul1(gf *dst, gf *src, gf c, int sz);
+
+/* Exposed by packet_cook.cpp when compiled with -DBENCH_EXPOSE_INTERNALS */
+extern "C++" void bench_xor_tile(char *data, int len, const char *tile, int tile_len);
+extern "C++" int bench_cook_vec_width();
+extern "C++" const char *bench_xor_tile_impl();
+
+/* Exposed by lib/fec.cpp when compiled with -DBENCH_EXPOSE_INTERNALS */
+extern "C++" const char *bench_addmul1_impl();
+
+/* Packet sizes representative of real traffic */
+static const size_t bench_sizes[] = { 64, 256, 1024, 1500 };
+static const int bench_sizes_count = sizeof(bench_sizes) / sizeof(bench_sizes[0]);
+
+/* Registration functions called from bench_main.cpp */
+void register_fec_benchmarks(void *bench_ptr);
+void register_crc32_benchmarks(void *bench_ptr);
+void register_packet_benchmarks(void *bench_ptr);
+
+/* Test entry points called from test_main.cpp */
+int run_fec_tests();
+int run_crc32_tests();
+int run_packet_tests();
+
+#endif
diff --git a/bench/bench_crc32.cpp b/bench/bench_crc32.cpp
new file mode 100644
index 0000000..2a307b2
--- /dev/null
+++ b/bench/bench_crc32.cpp
@@ -0,0 +1,61 @@
+#include "nanobench.h"
+#include "bench_common.h"
+#include "crc32c.h"
+#include "crc32/Crc32.h"
+#include <cstdlib>
+#include <string>
+
+void register_crc32_benchmarks(void *bench_ptr) {
+    /* Registers the CRC benchmarks on the caller's Bench, one entry per
+     * variant and per size in bench_sizes, named "<variant>/<size>B".
+     * bench_ptr is typed void* in the signature; cast it back before use. */
+    ankerl::nanobench::Bench &runner = *static_cast<ankerl::nanobench::Bench *>(bench_ptr);
+
+    /* Shared input for all variants: 1500 bytes, filled once from rand().
+     * The lambdas read it directly (static storage, so no capture needed). */
+    static char buf[1500];
+    for (char *p = buf; p != buf + sizeof(buf); ++p)
+        *p = (char)(rand() & 0xFF);
+
+    /* --- Old CRC32 (zlib polynomial) baseline --- */
+    for (size_t len : bench_sizes) {
+        const std::string label = "crc32_old/" + std::to_string(len) + "B";
+
+        runner.run(label, [len]() {
+            auto sum = crc32_fast(buf, len);
+            ankerl::nanobench::doNotOptimizeAway(sum);
+        });
+    }
+
+    /* --- CRC32C software --- */
+    for (size_t len : bench_sizes) {
+        const std::string label = "crc32c_sw/" + std::to_string(len) + "B";
+
+        runner.run(label, [len]() {
+            auto sum = crc32c_sw(buf, len);
+            ankerl::nanobench::doNotOptimizeAway(sum);
+        });
+    }
+
+    /* --- CRC32C hardware: registered only when crc32c_has_hw() is true --- */
+    if (crc32c_has_hw()) {
+        for (size_t len : bench_sizes) {
+            const std::string label = "crc32c_hw/" + std::to_string(len) + "B";
+
+            runner.run(label, [len]() {
+                auto sum = crc32c_hw(buf, len);
+                ankerl::nanobench::doNotOptimizeAway(sum);
+            });
+        }
+    }
+
+    /* --- CRC32C dispatched (production path) --- */
+    for (size_t len : bench_sizes) {
+        const std::string label = "crc32c/" + std::to_string(len) + "B";
+
+        runner.run(label, [len]() {
+            auto sum = crc32c(buf, len);
+            ankerl::nanobench::doNotOptimizeAway(sum);
+        });
+    }
+}
diff --git a/bench/bench_fec.cpp b/bench/bench_fec.cpp
new file mode 100644
index 0000000..6316956
--- /dev/null
+++ b/bench/bench_fec.cpp
@@ -0,0 +1,93 @@
+#include "nanobench.h"
+#include "bench_common.h"
+#include "lib/rs.h"
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+static void fill_random(char *buf, int sz) {
+    /* One rand() call per byte, keeping the low 8 bits; sz <= 0 writes nothing */
+    for (char *out = buf; sz > 0; --sz) *out++ = (char)(rand() & 0xFF);
+}
+
+void register_fec_benchmarks(void *bench_ptr) {
+    /* Registers the addmul1, rs_encode2 and rs_decode2 benchmarks on *bench_ptr */
+    auto &bench = *static_cast<ankerl::nanobench::Bench *>(bench_ptr);
+    /* GF tables are initialized inside fec_new; force init via a dummy allocation */
+    { void *d = fec_new(2, 3); fec_free(d); }
+
+    /* --- addmul1 microbenchmarks --- */
+    /* Source is randomized once; an all-zero block would not be representative */
+    static gf dst[1500], src[1500];
+    fill_random((char *)src, (int)sizeof(src));
+    for (int i = 0; i < bench_sizes_count; i++) {
+        int sz = (int)bench_sizes[i];
+        std::string name = "addmul1/" + std::to_string(sz) + "B";
+        bench.run(name, [sz]() {
+            bench_addmul1(dst, src, 0x53, sz);
+            ankerl::nanobench::doNotOptimizeAway(dst[0]);
+        });
+    }
+
+    /* --- rs_encode2 --- */
+    struct { int k; int n; const char *label; } encode_configs[] = {
+        {5, 8, "5/8"}, {10, 15, "10/15"}
+    };
+
+    for (auto &cfg : encode_configs) {
+        std::string name = std::string("rs_encode/k") + cfg.label + "/1500B";
+        int k = cfg.k, n = cfg.n;
+
+        /* Pre-allocate outside the timed loop */
+        char **data = (char **)calloc(n, sizeof(char *));
+        for (int j = 0; j < n; j++)
+            data[j] = (char *)calloc(1, 1500);
+        for (int j = 0; j < k; j++)
+            fill_random(data[j], 1500);
+
+        bench.run(name, [k, n, data]() {
+            rs_encode2(k, n, data, 1500);
+            ankerl::nanobench::doNotOptimizeAway(data[k][0]);
+        });
+
+        for (int j = 0; j < n; j++) free(data[j]);
+        free(data);
+    }
+
+    /* --- rs_decode2 --- */
+    for (auto &cfg : encode_configs) {
+        std::string name = std::string("rs_decode/k") + cfg.label + "/1500B";
+        int k = cfg.k, n = cfg.n;
+        int redundant = n - k;
+
+        /* Prepare encoded data once */
+        char **orig = (char **)calloc(n, sizeof(char *));
+        for (int j = 0; j < n; j++)
+            orig[j] = (char *)calloc(1, 1500);
+        for (int j = 0; j < k; j++)
+            fill_random(orig[j], 1500);
+        rs_encode2(k, n, orig, 1500);
+
+        /* Working copy for each decode iteration */
+        char **data = (char **)calloc(n, sizeof(char *));
+        char **bufs = (char **)calloc(n, sizeof(char *));
+        for (int j = 0; j < n; j++)
+            bufs[j] = (char *)calloc(1, 1500);
+
+        bench.run(name, [k, n, redundant, orig, data, bufs]() {
+            /* Reset working copy from originals */
+            for (int j = 0; j < n; j++)
+                memcpy(bufs[j], orig[j], 1500);
+
+            /* Simulate losing the first 'redundant' data packets */
+            for (int j = 0; j < n; j++)
+                data[j] = (j < redundant) ? NULL : bufs[j];
+
+            rs_decode2(k, n, data, 1500);
+            ankerl::nanobench::doNotOptimizeAway(data[0][0]);
+        });
+
+        for (int j = 0; j < n; j++) { free(orig[j]); free(bufs[j]); }
+        free(orig); free(data); free(bufs);
+    }
+}
diff --git a/bench/bench_main.cpp b/bench/bench_main.cpp
new file mode 100644
index 0000000..e8b8bc9
--- /dev/null
+++ b/bench/bench_main.cpp
@@ -0,0 +1,63 @@
+#define ANKERL_NANOBENCH_IMPLEMENT
+#include "nanobench.h"
+#include "bench_common.h"
+#include "lib/rs.h"
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <vector>
+
+int main(int argc, char *argv[]) {
+    /* --json: also write bench_results.json (read by the CI benchmark step) */
+    bool json_output = false;
+    for (int i = 1; i < argc; i++)
+        if (strcmp(argv[i], "--json") == 0)
+            json_output = true;
+
+    /* Force FEC init so addmul1 dispatch is resolved */
+    { void *d = fec_new(2, 3); fec_free(d); }
+
+    printf("SIMD: addmul1=%s  xor_cook=%s  vec_width=%d\n",
+        bench_addmul1_impl(), bench_xor_tile_impl(), bench_cook_vec_width());
+
+    ankerl::nanobench::Bench bench;
+    bench.title("UDPspeeder").warmup(3).epochs(21).relative(false);
+
+    register_fec_benchmarks(&bench);
+    register_crc32_benchmarks(&bench);
+    register_packet_benchmarks(&bench);
+
+    const auto &results = bench.results(); /* bound once, read by both consumers below */
+
+    /* Emit stability warnings for noisy benchmarks */
+    for (size_t i = 0; i < results.size(); i++) {
+        double mdape = results[i].medianAbsolutePercentError(
+            ankerl::nanobench::Result::Measure::elapsed);
+        if (mdape > 0.05) {
+            fprintf(stderr, "WARNING: %s has MdAPE %.1f%% (>5%%)\n",
+                results[i].config().mBenchmarkName.c_str(), mdape * 100.0);
+        }
+    }
+
+    if (json_output) {
+        /* github-action-benchmark customSmallerIsBetter format
+         * Mustache templates can't do math, so we extract results manually */
+        std::ofstream out("bench_results.json");
+        out << "[\n";
+        for (size_t i = 0; i < results.size(); i++) {
+            double ns = results[i].median(ankerl::nanobench::Result::Measure::elapsed) * 1e9;
+            out << "  {\n"
+                << "    \"name\": \"" << results[i].config().mBenchmarkName << "\",\n"
+                << "    \"unit\": \"ns/op\",\n"
+                << "    \"value\": " << ns << "\n"
+                << "  }" << (i + 1 < results.size() ? ",\n" : "\n");
+        }
+        out << "]\n";
+        if (!out.flush()) { /* failed open or write: fail here, not in a later CI step */
+            fprintf(stderr, "ERROR: could not write bench_results.json\n");
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/bench/bench_packet.cpp b/bench/bench_packet.cpp
new file mode 100644
index 0000000..27a1809
--- /dev/null
+++ b/bench/bench_packet.cpp
@@ -0,0 +1,106 @@
+#include "nanobench.h"
+#include "bench_common.h"
+#include "packet_cook.h"
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+/* Stubs for packet_cook.cpp dependencies — production uses common.cpp */
+void get_fake_random_chars(char *s, int len) {
+    int filled = 0; /* bytes written so far; each is the low byte of one rand() value */
+    while (filled < len) s[filled++] = (char)(rand() & 0xFF);
+}
+
+int random_between(unsigned int a, unsigned int b) {
+    const unsigned int span = b + 1 - a; /* size of the inclusive range [a, b] */
+    return (a == b) ? (int)a : (int)(a + (unsigned int)rand() % span);
+}
+
+static cook_ctx_t make_ctx(int checksum, int obscure, int xor_enc) {
+    /* Arguments name the stages to ENABLE; the context stores disable_* flags */
+    cook_ctx_t ctx = {};
+    strcpy(ctx.key, "benchmarkkey1234");
+    cook_ctx_prepare_key(&ctx);
+    ctx.iv_min = ctx.iv_max = 16;
+    ctx.disable_checksum = (checksum == 0);
+    ctx.disable_obscure = (obscure == 0);
+    ctx.disable_xor = (xor_enc == 0);
+    return ctx;
+}
+
+/* Registers the packet-cook benchmarks on the caller's nanobench Bench:
+ *   do_cook/<size>B          checksum + obscure + xor, every size in bench_sizes
+ *   de_cook/<size>B          same context, decoding a packet cooked beforehand
+ *   cook_<stage>_only/1500B  a single stage enabled, 1500-byte payload
+ * bench_ptr is typed void* in the signature and is cast back here. */
+void register_packet_benchmarks(void *bench_ptr) {
+    ankerl::nanobench::Bench &runner = *static_cast<ankerl::nanobench::Bench *>(bench_ptr);
+
+    /* Full pipeline: do_cook at all sizes */
+    for (size_t payload : bench_sizes) {
+        const int sz = (int)payload;
+        const std::string label = "do_cook/" + std::to_string(sz) + "B";
+        cook_ctx_t ctx = make_ctx(1, 1, 1);
+
+        /* The 0xAB fill happens inside the timed lambda, before every do_cook */
+        runner.run(label, [sz, &ctx]() {
+            static char buf[4096];
+            memset(buf, 0xAB, sz);
+            int len = sz;
+            do_cook(&ctx, buf, len);
+            ankerl::nanobench::doNotOptimizeAway(buf[0]);
+        });
+    }
+
+    /* Full pipeline: de_cook at all sizes */
+    for (size_t payload : bench_sizes) {
+        const int sz = (int)payload;
+        const std::string label = "de_cook/" + std::to_string(sz) + "B";
+        cook_ctx_t ctx = make_ctx(1, 1, 1);
+
+        /* Cook one packet up front, outside the timed lambda. `cooked` is static,
+         * so the lambda below reads it without capturing it. */
+        static char cooked[4096];
+        memset(cooked, 0xAB, sz);
+        int cooked_len = sz;
+        do_cook(&ctx, cooked, cooked_len);
+        const int saved_len = cooked_len;
+
+        /* Each timed iteration copies the cooked packet, then decodes the copy */
+        runner.run(label, [saved_len, &ctx]() {
+            static char buf[4096];
+            memcpy(buf, cooked, saved_len);
+            int len = saved_len;
+            de_cook(&ctx, buf, len);
+            ankerl::nanobench::doNotOptimizeAway(buf[0]);
+        });
+    }
+
+    /* Components at 1500B: one row per benchmark, registered in table order.
+     * A flag of 1 enables that stage; make_ctx turns it into disable_* = 0. */
+    struct stage_t {
+        const char *label; /* benchmark name */
+        int checksum;      /* the three flags are forwarded to make_ctx as-is */
+        int obscure;
+        int xor_enc;
+    };
+
+    static const stage_t stages[] = {
+        {"cook_crc32_only/1500B", 1, 0, 0},
+        {"cook_obscure_only/1500B", 0, 1, 0},
+        {"cook_xor_only/1500B", 0, 0, 1},
+    };
+
+    for (const stage_t &stage : stages) {
+        cook_ctx_t ctx = make_ctx(stage.checksum, stage.obscure, stage.xor_enc);
+
+        /* Same body as the do_cook pipeline benchmark, fixed at 1500 bytes */
+        runner.run(stage.label, [&ctx]() {
+            static char buf[4096];
+            memset(buf, 0xAB, 1500);
+            int len = 1500;
+            do_cook(&ctx, buf, len);
+            ankerl::nanobench::doNotOptimizeAway(buf[0]);
+        });
+    }
+}
diff --git a/bench/interop.sh b/bench/interop.sh
new file mode 100755
index 0000000..803d114
--- /dev/null
+++ b/bench/interop.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+# bench/interop.sh — Cross-architecture interop test
+#
+# Runs a UDPspeeder tunnel between two (possibly different-arch) binaries
+# and verifies data integrity. Both binaries can be prefixed with QEMU.
+#
+# Usage: ./bench/interop.sh --server-cmd CMD --client-cmd CMD [options]
+#   --server-cmd CMD   Command to run server (may include QEMU prefix)
+#   --client-cmd CMD   Command to run client (may include QEMU prefix)
+#   --fec X:Y          FEC parameter (default: disabled)
+#   --disable-fec      Explicitly disable FEC (default)
+#   --key KEY          Encryption key
+#   --packets N        Number of packets to send (default: 200)
+#   --label LABEL      Label for output (default: "interop")
+#   --log-level N      Log level passed to both binaries (default: 4)
+set -euo pipefail
+
+SERVER_CMD=""
+CLIENT_CMD=""
+FEC_ARGS="--disable-fec"
+KEY_ARGS=""
+PACKETS=200
+LABEL="interop"
+LOG_LEVEL=4
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --server-cmd) SERVER_CMD="$2"; shift 2 ;;
+        --client-cmd) CLIENT_CMD="$2"; shift 2 ;;
+        --fec) FEC_ARGS="-f $2"; shift 2 ;;
+        --disable-fec) FEC_ARGS="--disable-fec"; shift ;;
+        --key) KEY_ARGS="-k $2"; shift 2 ;;
+        --packets) PACKETS="$2"; shift 2 ;;
+        --label) LABEL="$2"; shift 2 ;;
+        --log-level) LOG_LEVEL="$2"; shift 2 ;;
+        *) echo "Unknown option: $1" >&2; exit 1 ;;
+    esac
+done
+
+if [[ -z "$SERVER_CMD" || -z "$CLIENT_CMD" ]]; then
+    echo "Error: --server-cmd and --client-cmd are required" >&2
+    exit 1
+fi
+
+PORT_TUNNEL=20010
+PORT_APP=20011
+PORT_CLIENT=20012
+
+RECV_RESULT=$(mktemp)
+SERVER_LOG=$(mktemp)
+CLIENT_LOG=$(mktemp)
+
+cleanup() {  # stop and reap background jobs, then remove the temp files
+    local bg_pids
+    bg_pids=$(jobs -p 2>/dev/null) || true
+    if [[ -n "$bg_pids" ]]; then
+        # $bg_pids is left unquoted so each PID becomes its own argument
+        kill $bg_pids 2>/dev/null || true; wait $bg_pids 2>/dev/null || true
+    fi
+    rm -f "$RECV_RESULT" "$SERVER_LOG" "$CLIENT_LOG"
+}
+trap cleanup EXIT
+
+dump_logs() {  # stderr only: last 80 lines of each tunnel log
+    printf '  --- %s ---\n' "SERVER LOG (last 80 lines)" >&2
+    tail -n 80 "$SERVER_LOG" >&2 2>/dev/null || true
+    printf '  --- %s ---\n' "CLIENT LOG (last 80 lines)" >&2
+    tail -n 80 "$CLIENT_LOG" >&2 2>/dev/null || true
+    printf '  --- %s ---\n' "END LOGS" >&2
+}
+
+# Receiver: validate each packet's content
+# Packet format: 4-byte big-endian seq + 1396 bytes of (seq & 0xFF)
+python3 -c "
+import socket, struct, sys
+
+sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+sock.bind(('127.0.0.1', $PORT_APP))
+sock.settimeout(10)
+
+valid = 0
+invalid = 0
+
+try:
+    while True:
+        data = sock.recv(65535)
+        sock.settimeout(3)  # shorter timeout after first packet
+        if len(data) < 4:
+            invalid += 1
+            continue
+        seq = struct.unpack('>I', data[:4])[0]
+        fill = seq & 0xFF
+        expected = data[:4] + bytes([fill]) * (len(data) - 4)
+        if data == expected:
+            valid += 1
+        else:
+            invalid += 1
+            sys.stderr.write('CORRUPT seq=%d len=%d\n' % (seq, len(data)))
+            sys.stderr.flush()
+except socket.timeout:
+    pass
+
+print('%d %d' % (valid, invalid))
+" > "$RECV_RESULT" 2>&1 &
+RECV_PID=$!
+
+# Start tunnel (io_uring disabled — QEMU can't translate those syscalls)
+UDPSPEEDER_NO_URING=1 $SERVER_CMD \
+    -s -l 127.0.0.1:$PORT_TUNNEL -r 127.0.0.1:$PORT_APP \
+    $FEC_ARGS $KEY_ARGS --log-level $LOG_LEVEL >"$SERVER_LOG" 2>&1 &
+
+UDPSPEEDER_NO_URING=1 $CLIENT_CMD \
+    -c -l 127.0.0.1:$PORT_CLIENT -r 127.0.0.1:$PORT_TUNNEL \
+    $FEC_ARGS $KEY_ARGS --log-level $LOG_LEVEL >"$CLIENT_LOG" 2>&1 &
+
+sleep 2  # let QEMU-emulated binaries start
+
+# Sender: N packets, each 1400 bytes with verifiable content
+python3 -c "
+import socket, struct, time
+
+sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+for seq in range($PACKETS):
+    header = struct.pack('>I', seq)
+    fill = bytes([seq & 0xFF]) * 1396
+    sock.sendto(header + fill, ('127.0.0.1', $PORT_CLIENT))
+    time.sleep(0.001)
+"
+
+echo "  [$LABEL] sender done ($PACKETS packets), waiting for receiver..." >&2
+
+wait $RECV_PID 2>/dev/null || true
+
+# Parse results
+RESULT=$(cat "$RECV_RESULT")
+
+VALID=$(echo "$RESULT" | tail -1 | awk '{print $1}')
+INVALID=$(echo "$RESULT" | tail -1 | awk '{print $2}')
+VALID=${VALID:-0}
+INVALID=${INVALID:-0}
+
+echo "  [$LABEL] valid=$VALID invalid=$INVALID sent=$PACKETS" >&2
+
+if [[ "$INVALID" -ne 0 ]]; then
+    echo "FAIL [$LABEL]: $INVALID corrupted packets" >&2
+    dump_logs
+    exit 1
+fi
+
+if [[ "$VALID" -eq 0 ]]; then
+    echo "FAIL [$LABEL]: no packets received" >&2
+    dump_logs
+    exit 1
+fi
+
+MIN_EXPECTED=$(( PACKETS / 2 ))
+if [[ "$VALID" -lt "$MIN_EXPECTED" ]]; then
+    echo "FAIL [$LABEL]: only $VALID/$PACKETS packets (expected >=$MIN_EXPECTED)" >&2
+    dump_logs
+    exit 1
+fi
+
+echo "PASS [$LABEL]: $VALID/$PACKETS packets, 0 corrupt"
diff --git a/bench/nanobench.h b/bench/nanobench.h
new file mode 100644
index 0000000..127240d
--- /dev/null
+++ b/bench/nanobench.h
@@ -0,0 +1,3484 @@
+//  __   _ _______ __   _  _____  ______  _______ __   _ _______ _     _
+//  | \  | |_____| | \  | |     | |_____] |______ | \  | |       |_____|
+//  |  \_| |     | |  \_| |_____| |_____] |______ |  \_| |_____  |     |
+//
+// Microbenchmark framework for C++11/14/17/20
+// https://github.com/martinus/nanobench
+//
+// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2019-2023 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef ANKERL_NANOBENCH_H_INCLUDED
+#define ANKERL_NANOBENCH_H_INCLUDED
+
+// see https://semver.org/
+#define ANKERL_NANOBENCH_VERSION_MAJOR 4  // incompatible API changes
+#define ANKERL_NANOBENCH_VERSION_MINOR 3  // backwards-compatible changes
+#define ANKERL_NANOBENCH_VERSION_PATCH 11 // backwards-compatible bug fixes
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// public facing api - as minimal as possible
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <chrono>        // high_resolution_clock
+#include <cstring>       // memcpy
+#include <iosfwd>        // for std::ostream* custom output target in Config
+#include <string>        // all names
+#include <unordered_map> // holds context information of results
+#include <vector>        // holds all results
+
+#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
+
+#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
+#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
+#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
+#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
+#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
+
+#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
+#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
+#else
+#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
+#endif
+
+#if defined(__clang__)
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
+        _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
+#else
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
+#endif
+
+#if defined(__GNUC__)
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
+#else
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
+#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
+#endif
+
+#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
+#    include <iostream>
+#    define ANKERL_NANOBENCH_LOG(x)                                                 \
+        do {                                                                        \
+            std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
+        } while (0)
+#else
+#    define ANKERL_NANOBENCH_LOG(x) \
+        do {                        \
+        } while (0)
+#endif
+
+#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
+#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
+#    include <linux/version.h>
+#    if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
+// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
+// PERF_FLAG_FD_CLOEXEC since kernel 3.14
+#        undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
+#        define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
+#    endif
+#endif
+
+#if defined(__clang__)
+#    define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
+#else
+#    define ANKERL_NANOBENCH_NO_SANITIZE(...)
+#endif
+
+#if defined(_MSC_VER)
+#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
+#else
+#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
+#endif
+
+// workaround missing "is_trivially_copyable" in g++ < 5.0
+// See https://stackoverflow.com/a/31798726/48181
+#if defined(__GNUC__) && __GNUC__ < 5
+#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
+#else
+#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
+#endif
+
+// noexcept may be missing for std::string.
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58265
+#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable<std::string>::value
+
+// declarations ///////////////////////////////////////////////////////////////////////////////////
+
+namespace ankerl {
+namespace nanobench {
+
+using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
+                               std::chrono::steady_clock>::type;
+class Bench;
+struct Config;
+class Result;
+class Rng;
+class BigO;
+
+/**
+ * @brief Renders output from a mustache-like template and benchmark results.
+ *
+ * The templating facility here is heavily inspired by [mustache - logic-less templates](https://mustache.github.io/).
+ * It adds a few more features that are necessary to get all of the captured data out of nanobench. Please read the
+ * excellent [mustache manual](https://mustache.github.io/mustache.5.html) to see what this is all about.
+ *
+ * nanobench output has two nested layers, *result* and *measurement*.  Here is a hierarchy of the allowed tags:
+ *
+ * * `{{#result}}` Marks the begin of the result layer. Whatever comes after this will be instantiated as often as
+ *   a benchmark result is available. Within it, you can use these tags:
+ *
+ *    * `{{title}}` See Bench::title.
+ *
+ *    * `{{name}}` Benchmark name, usually directly provided with Bench::run, but can also be set with Bench::name.
+ *
+ *    * `{{unit}}` Unit, e.g. `byte`. Defaults to `op`, see Bench::unit.
+ *
+ *    * `{{batch}}` Batch size, see Bench::batch.
+ *
+ *    * `{{complexityN}}` Value used for asymptotic complexity calculation. See Bench::complexityN.
+ *
+ *    * `{{epochs}}` Number of epochs, see Bench::epochs.
+ *
+ *    * `{{clockResolution}}` Accuracy of the clock, i.e. what's the smallest time possible to measure with the clock.
+ *      For modern systems, this can be around 20 ns. This value is automatically determined by nanobench at the first
+ *      benchmark that is run, and used as a static variable throughout the application's runtime.
+ *
+ *    * `{{clockResolutionMultiple}}` Configuration multiplier for `clockResolution`. See Bench::clockResolutionMultiple.
+ *      This is the target runtime for each measurement (epoch). That means the more accurate your clock is, the faster
+ *      will be the benchmark. Basing the measurement's runtime on the clock resolution is the main reason why nanobench is so fast.
+ *
+ *    * `{{maxEpochTime}}` Configuration for a maximum time each measurement (epoch) is allowed to take. Note that at least
+ *      a single iteration will be performed, even when that takes longer than maxEpochTime. See Bench::maxEpochTime.
+ *
+ *    * `{{minEpochTime}}` Minimum epoch time, defaults to 1ms. See Bench::minEpochTime.
+ *
+ *    * `{{minEpochIterations}}` See Bench::minEpochIterations.
+ *
+ *    * `{{epochIterations}}` See Bench::epochIterations.
+ *
+ *    * `{{warmup}}` Number of iterations used before measuring starts. See Bench::warmup.
+ *
+ *    * `{{relative}}` True or false, depending on the setting you have used. See Bench::relative.
+ *
+ *    * `{{context(variableName)}}` See Bench::context.
+ *
+ *    Apart from these tags, it is also possible to use some mathematical operations on the measurement data. The operations
+ *    are of the form `{{command(name)}}`.  Currently `name` can be one of `elapsed`, `iterations`. If performance counters
+ *    are available (currently only on current Linux systems), you also have `pagefaults`, `cpucycles`,
+ *    `contextswitches`, `instructions`, `branchinstructions`, and `branchmisses`. All the measures (except `iterations`) are
+ *    provided for a single iteration (so `elapsed` is the time a single iteration took). The following tags are available:
+ *
+ *    * `{{median(<name>)}}` Calculate median of a measurement data set, e.g. `{{median(elapsed)}}`.
+ *
+ *    * `{{average(<name>)}}` Average (mean) calculation.
+ *
+ *    * `{{medianAbsolutePercentError(<name>)}}` Calculates MdAPE, the Median Absolute Percentage Error. The MdAPE is an excellent
+ *      metric for the variation of measurements. It is more robust to outliers than the
+ *      [Mean absolute percentage error (M-APE)](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error).
+ *      @f[
+ *       \mathrm{MdAPE}(e) = \mathrm{med}\{| \frac{e_i - \mathrm{med}\{e\}}{e_i}| \}
+ *      @f]
+ *      E.g. for *elapsed*: First, @f$ \mathrm{med}\{e\} @f$ calculates the median by sorting and then taking the middle element
+ *      of all *elapsed* measurements. This is used to calculate the absolute percentage
+ *      error to this median for each measurement, as in  @f$ | \frac{e_i - \mathrm{med}\{e\}}{e_i}| @f$. All these results
+ *      are sorted, and the middle value is chosen as the median absolute percent error.
+ *
+ *      This measurement is a bit hard to interpret, but it is very robust against outliers. E.g. a value of 5% means that half of the
+ *      measurements deviate less than 5% from the median, and the other deviate more than 5% from the median.
+ *
+ *    * `{{sum(<name>)}}` Sum of all the measurements. E.g. `{{sum(iterations)}}` will give you the total number of iterations
+*        measured in this benchmark.
+ *
+ *    * `{{minimum(<name>)}}` Minimum of all measurements.
+ *
+ *    * `{{maximum(<name>)}}` Maximum of all measurements.
+ *
+ *    * `{{sumProduct(<first>, <second>)}}` Calculates the sum of the products of corresponding measures:
+ *      @f[
+ *          \mathrm{sumProduct}(a,b) = \sum_{i=1}^{n}a_i\cdot b_i
+ *      @f]
+ *      E.g. to calculate total runtime of the benchmark, you multiply iterations with elapsed time for each measurement, and
+ *      sum these results up:
+ *      `{{sumProduct(iterations, elapsed)}}`.
+ *
+ *    * `{{#measurement}}` To access individual measurement results, open the begin tag for measurements.
+ *
+ *       * `{{elapsed}}` Average elapsed wall clock time per iteration, in seconds.
+ *
+ *       * `{{iterations}}` Number of iterations in the measurement. The number of iterations will fluctuate due
+ *         to some applied randomness, to enhance accuracy.
+ *
+ *       * `{{pagefaults}}` Average number of pagefaults per iteration.
+ *
+ *       * `{{cpucycles}}` Average number of CPU cycles processed per iteration.
+ *
+ *       * `{{contextswitches}}` Average number of context switches per iteration.
+ *
+ *       * `{{instructions}}` Average number of retired instructions per iteration.
+ *
+ *       * `{{branchinstructions}}` Average number of branches executed per iteration.
+ *
+ *       * `{{branchmisses}}` Average number of branches that were missed per iteration.
+ *
+ *    * `{{/measurement}}` Ends the measurement tag.
+ *
+ * * `{{/result}}` Marks the end of the result layer. This is the end marker for the template part that will be instantiated
+ *   for each benchmark result.
+ *
+ *
+ *  For the layer tags *result* and *measurement* you additionally can use these special markers:
+ *
+ *  * ``{{#-first}}`` - Begin marker of a template that will be instantiated *only for the first* entry in the layer. Use is only
+ *    allowed between the begin and end marker of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
+ *    ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-first}}``.
+ *
+ *  * ``{{^-first}}`` - Begin marker of a template that will be instantiated *for each except the first* entry in the layer. This,
+ *    this is basically the inversion of ``{{#-first}}``. Use is only allowed between the begin and end marker of the layer.
+ *    So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
+ *
+ *  * ``{{/-first}}`` - End marker for either ``{{#-first}}`` or ``{{^-first}}``.
+ *
+ *  * ``{{#-last}}`` - Begin marker of a template that will be instantiated *only for the last* entry in the layer. Use is only
+ *    allowed between the begin and end marker of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
+ *    ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-last}}``.
+ *
+ *  * ``{{^-last}}`` - Begin marker of a template that will be instantiated *for each except the last* entry in the layer. This,
+ *    this is basically the inversion of ``{{#-last}}``. Use is only allowed between the begin and end marker of the layer.
+ *    So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
+ *
+ *  * ``{{/-last}}`` - End marker for either ``{{#-last}}`` or ``{{^-last}}``.
+ *
+   @verbatim embed:rst
+
+   For an overview of all the possible data you can get out of nanobench, please see the tutorial at :ref:`tutorial-template-json`.
+
+   The templates that ship with nanobench are:
+
+   * :cpp:func:`templates::csv() <ankerl::nanobench::templates::csv()>`
+   * :cpp:func:`templates::json() <ankerl::nanobench::templates::json()>`
+   * :cpp:func:`templates::htmlBoxplot() <ankerl::nanobench::templates::htmlBoxplot()>`
+   * :cpp:func:`templates::pyperf() <ankerl::nanobench::templates::pyperf()>`
+
+   @endverbatim
+ *
+ * @param mustacheTemplate The template.
+ * @param bench Benchmark, containing all the results.
+ * @param out Output for the generated output.
+ */
+void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
+void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out); // template given as std::string
+
+/**
+ * Same as render(char const* mustacheTemplate, Bench const& bench, std::ostream& out), but takes the
+ * results directly, for when you only have results available and no Bench object.
+ *
+ * @param mustacheTemplate The mustache-like template to render.
+ * @param results All the results to be used for rendering.
+ * @param out Stream that receives the generated output.
+ */
+void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
+void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out); // template given as std::string
+
+// Contains the mustache-like templates that ship with nanobench; each function returns the template text.
+namespace templates {
+
+/*!
+  @brief CSV data for the benchmark results.
+
+  Generates a comma-separated values dataset. The first line is the header, each following line is a summary of one benchmark run.
+
+  @verbatim embed:rst
+  See the tutorial at :ref:`tutorial-template-csv` for an example.
+  @endverbatim
+ */
+char const* csv() noexcept;
+
+/*!
+  @brief HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an example output.
+
+  The output uses only the elapsed wall clock time, and displays each epoch as a single dot.
+  @verbatim embed:rst
+  See the tutorial at :ref:`tutorial-template-html` for an example.
+  @endverbatim
+
+  @see ankerl::nanobench::render()
+ */
+char const* htmlBoxplot() noexcept;
+
+/*!
+ @brief Output in pyperf compatible JSON format, which can be used for further analysis.
+ @verbatim embed:rst
+ See the tutorial at :ref:`tutorial-template-pyperf` for an example of how to further analyze the output.
+ @endverbatim
+ */
+char const* pyperf() noexcept;
+
+/*!
+  @brief Template to generate JSON data.
+
+  The generated JSON data contains *all* data that has been generated. All times are double values, in seconds. The output can get
+  quite large.
+  @verbatim embed:rst
+  See the tutorial at :ref:`tutorial-template-json` for an example.
+  @endverbatim
+ */
+char const* json() noexcept;
+
+} // namespace templates
+
+namespace detail { // forward declarations only; nothing is defined in this namespace block
+
+template <typename T>
+struct PerfCountSet; // defined in the "definitions" section further down
+
+class IterationLogic;
+class PerformanceCounters;
+
+#if ANKERL_NANOBENCH(PERF_COUNTERS)
+class LinuxPerformanceCounters; // declared only when ANKERL_NANOBENCH(PERF_COUNTERS) is true
+#endif
+
+} // namespace detail
+} // namespace nanobench
+} // namespace ankerl
+
+// definitions ////////////////////////////////////////////////////////////////////////////////////
+
+namespace ankerl {
+namespace nanobench {
+namespace detail {
+
+template <typename T>
+struct PerfCountSet { // one value of type T per counter; every member is value-initialized via {}
+    T pageFaults{};
+    T cpuCycles{};
+    T contextSwitches{};
+    T instructions{};
+    T branchInstructions{};
+    T branchMisses{};
+};
+
+} // namespace detail
+
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+struct Config {
+    // Benchmark settings; the initializers below are the default member values.
+    std::string mBenchmarkTitle = "benchmark";                               // NOLINT(misc-non-private-member-variables-in-classes)
+    std::string mBenchmarkName = "noname";                                   // NOLINT(misc-non-private-member-variables-in-classes)
+    std::string mUnit = "op";                                                // NOLINT(misc-non-private-member-variables-in-classes)
+    double mBatch = 1.0;                                                     // NOLINT(misc-non-private-member-variables-in-classes)
+    double mComplexityN = -1.0;                                              // NOLINT(misc-non-private-member-variables-in-classes)
+    size_t mNumEpochs = 11;                                                  // NOLINT(misc-non-private-member-variables-in-classes)
+    size_t mClockResolutionMultiple = static_cast<size_t>(1000);             // NOLINT(misc-non-private-member-variables-in-classes)
+    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes)
+    std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1);   // NOLINT(misc-non-private-member-variables-in-classes)
+    uint64_t mMinEpochIterations{1};                                         // NOLINT(misc-non-private-member-variables-in-classes)
+    // If not 0, run *exactly* this number of iterations per epoch.
+    uint64_t mEpochIterations{0};                                          // NOLINT(misc-non-private-member-variables-in-classes)
+    uint64_t mWarmup = 0;                                                  // NOLINT(misc-non-private-member-variables-in-classes)
+    std::ostream* mOut = nullptr;                                          // NOLINT(misc-non-private-member-variables-in-classes)
+    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes)
+    std::string mTimeUnitName = "ns";                                      // NOLINT(misc-non-private-member-variables-in-classes)
+    bool mShowPerformanceCounters = true;                                  // NOLINT(misc-non-private-member-variables-in-classes)
+    bool mIsRelative = false;                                              // NOLINT(misc-non-private-member-variables-in-classes)
+    std::unordered_map<std::string, std::string> mContext{};               // NOLINT(misc-non-private-member-variables-in-classes)
+
+    Config();
+    ~Config();
+    Config& operator=(Config const& other);
+    Config& operator=(Config&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
+    Config(Config const& other);
+    Config(Config&& other) noexcept;
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+// Result returned after a benchmark has finished. Can be used as a baseline for relative().
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+class Result {
+public:
+    enum class Measure : size_t {
+        elapsed,
+        iterations,
+        pagefaults,
+        cpucycles,
+        contextswitches,
+        instructions,
+        branchinstructions,
+        branchmisses,
+        _size // not a measure itself: as the last enumerator it equals the number of measures above
+    };
+
+    explicit Result(Config benchmarkConfig);
+
+    ~Result();
+    Result& operator=(Result const& other);
+    Result& operator=(Result&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
+    Result(Result const& other);
+    Result(Result&& other) noexcept;
+
+    // Adds new measurement results.
+    // All values are scaled by iters (except iters itself).
+    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
+
+    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
+
+    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
+    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
+    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
+    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const;
+    ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const;
+
+    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
+    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
+
+    // Looks up the Measure for the given string; returns Measure::_size if the string is not found.
+    static Measure fromString(std::string const& str);
+
+private:
+    Config mConfig{};
+    std::vector<std::vector<double>> mNameToMeasurements{};
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+/**
+ * An extremely fast random generator. Currently, this implements *RomuDuoJr*, developed by Mark Overton. Source:
+ * http://www.romu-random.org/
+ *
+ * RomuDuoJr is extremely fast and provides reasonably good randomness. Not enough for large jobs, but definitely
+ * good enough for a benchmarking framework.
+ *
+ *  * Estimated capacity: @f$ 2^{51} @f$ bytes
+ *  * Register pressure: 4
+ *  * State size: 128 bits
+ *
+ * This random generator is a drop-in replacement for the generators supplied by ``<random>``. It is not
+ * cryptographically secure. Its intended purpose is to be very fast so that benchmarks that make use
+ * of randomness are not distorted too much by the random generator.
+ *
+ * Rng also provides a few non-standard helpers, optimized for speed.
+ */
+class Rng final {
+public:
+    /**
+     * @brief This RNG provides 64bit randomness.
+     */
+    using result_type = uint64_t;
+
+    static constexpr uint64_t(min)();
+    static constexpr uint64_t(max)();
+
+    /**
+     * As a safety precaution, we don't allow copying. Copying a PRNG would mean you would have two random generators that produce the
+     * same sequence, which is generally not what one wants. Instead create a new rng with the default constructor Rng(), which is
+     * automatically seeded from `std::random_device`. If you really need a copy, use `copy()`.
+     */
+    Rng(Rng const&) = delete;
+
+    /**
+     * Same as Rng(Rng const&), we don't allow assignment. If you need a new Rng create one with the default constructor Rng().
+     */
+    Rng& operator=(Rng const&) = delete;
+
+    // moving is ok
+    Rng(Rng&&) noexcept = default;
+    Rng& operator=(Rng&&) noexcept = default;
+    ~Rng() noexcept = default;
+
+    /**
+     * @brief Creates a new Random generator with random seed.
+     *
+     * Instead of a default seed (as the random generators from the STD), this properly seeds the random generator from
+     * `std::random_device`. It guarantees correct seeding. Note that seeding can be relatively slow, depending on the source of
+     * randomness used. So it is best to create a Rng once and use it for all your randomness purposes.
+     */
+    Rng();
+
+    /*!
+      Creates a new Rng that is seeded with a specific seed. Each Rng created from the same seed will produce the same randomness
+      sequence. This can be useful for deterministic behavior.
+
+      @verbatim embed:rst
+      .. note::
+
+         The random algorithm might change between nanobench releases. Whenever a faster and/or better random
+         generator becomes available, I will switch the implementation.
+      @endverbatim
+
+      As per the Romu paper, this seeds the Rng with the splitMix64 algorithm and performs 10 initial rounds for further mixing up of
+      the internal state.
+
+      @param seed  The 64bit seed. All values are allowed, even 0.
+     */
+    explicit Rng(uint64_t seed) noexcept;
+    Rng(uint64_t x, uint64_t y) noexcept;            // NOTE(review): presumably sets the two state words directly - confirm in the definition
+    explicit Rng(std::vector<uint64_t> const& data); // NOTE(review): presumably the counterpart of state() - confirm in the definition
+
+    /**
+     * Creates a copy of the Rng, thus the copy provides exactly the same random sequence as the original.
+     */
+    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
+
+    /**
+     * @brief Produces a 64bit random value. This should be very fast, thus it is marked as inline. In my benchmark, this is ~46 times
+     * faster than `std::default_random_engine` for producing 64bit random values. It seems that the fastest std contender is
+     * `std::mt19937_64`. Still, this RNG is 2-3 times as fast.
+     *
+     * @return uint64_t The next 64 bit random value.
+     */
+    inline uint64_t operator()() noexcept;
+
+    // Note: bounded() is slightly biased, see its documentation below.
+
+    /**
+     * Generates a random number between 0 and range (excluding range).
+     *
+     * The algorithm only produces 32bit numbers, and is slightly biased. The effect is quite small unless your range is close to the
+     * maximum value of an integer. It is possible to correct the bias with rejection sampling (see
+     * [here](https://lemire.me/blog/2016/06/30/fast-random-shuffling/)), but this is most likely irrelevant in practice for the
+     * purposes of this Rng.
+     *
+     * See Daniel Lemire's blog post [A fast alternative to the modulo
+     * reduction](https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/)
+     *
+     * @param range Upper exclusive range. E.g. a value of 3 will generate random numbers 0, 1, 2.
+     * @return uint32_t Generated random value in range [0, range(.
+     */
+    inline uint32_t bounded(uint32_t range) noexcept;
+
+    // random double in range [0, 1(
+    // see http://prng.di.unimi.it/
+
+    /**
+     * Provides a random uniform double value between 0 and 1. This uses the method described in [Generating uniform doubles in the
+     * unit interval](http://prng.di.unimi.it/), and is extremely fast.
+     *
+     * @return double Uniformly distributed double value in range [0,1(, excluding 1.
+     */
+    inline double uniform01() noexcept;
+
+    /**
+     * Shuffles all entries in the given container. Although this has a slight bias due to the implementation of bounded(), this is
+     * preferable to `std::shuffle` because it is over 5 times faster. See Daniel Lemire's blog post [Fast random
+     * shuffling](https://lemire.me/blog/2016/06/30/fast-random-shuffling/).
+     *
+     * @param container The whole container will be shuffled.
+     */
+    template <typename Container>
+    void shuffle(Container& container) noexcept;
+
+    /**
+     * Extracts the full state of the generator, e.g. for serialization. For this RNG this is just 2 values, but to stay API compatible
+     * with future implementations that potentially use more state, we use a vector.
+     *
+     * @return Vector containing the full state.
+     */
+    ANKERL_NANOBENCH(NODISCARD) std::vector<uint64_t> state() const;
+
+private:
+    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
+
+    uint64_t mX;
+    uint64_t mY;
+};
+
+/**
+ * @brief Main entry point to nanobench's benchmarking facility.
+ *
+ * It holds configuration and results from one or more benchmark runs. Usually it is used in a single line, where the object is
+ * constructed, configured, and then a benchmark is run. E.g. like this:
+ *
+ *     ankerl::nanobench::Bench().unit("byte").batch(1000).run("random fluctuations", [&] {
+ *         // here be the benchmark code
+ *     });
+ *
+ * In that example Bench() constructs the benchmark, it is then configured with unit() and batch(), and after configuration a
+ * benchmark is executed with run(). Once run() has finished, it prints the result to `std::cout`. It would also store the results
+ * in the Bench instance, but in this case the object is immediately destroyed so it's not available any more.
+ */
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+class Bench {
+public:
+    /**
+     * @brief Creates a new benchmark for configuration and running of benchmarks.
+     */
+    Bench();
+
+    Bench(Bench&& other) noexcept;
+    Bench& operator=(Bench&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
+    Bench(Bench const& other);
+    Bench& operator=(Bench const& other);
+    ~Bench() noexcept;
+
+    /*!
+      @brief Repeatedly calls `op()` based on the configuration, and performs measurements.
+
+      This call is marked with `noinline` to prevent the compiler from optimizing across different benchmarks. This can have quite a
+      big effect on benchmark accuracy.
+
+      @verbatim embed:rst
+      .. note::
+
+        Each call to your lambda must have a side effect, so that the compiler can't possibly optimize it away. E.g. add a result to an
+        externally defined number (like `x` in the above example), and finally call `doNotOptimizeAway` on the variables the compiler
+        must not remove. You can also use :cpp:func:`ankerl::nanobench::doNotOptimizeAway` directly in the lambda, but be aware that
+        this has a small overhead.
+
+      @endverbatim
+
+      @tparam Op The code to benchmark.
+     */
+    template <typename Op>
+    ANKERL_NANOBENCH(NOINLINE)
+    Bench& run(char const* benchmarkName, Op&& op);
+
+    template <typename Op>
+    ANKERL_NANOBENCH(NOINLINE)
+    Bench& run(std::string const& benchmarkName, Op&& op);
+
+    /**
+     * @brief Same as run(char const* benchmarkName, Op&& op), but instead uses the previously set name.
+     * @tparam Op The code to benchmark.
+     */
+    template <typename Op>
+    ANKERL_NANOBENCH(NOINLINE)
+    Bench& run(Op&& op);
+
+    /**
+     * @brief Title of the benchmark, will be shown in the table header. Changing the title will start a new markdown table.
+     *
+     * @param benchmarkTitle The title of the benchmark.
+     */
+    Bench& title(char const* benchmarkTitle);
+    Bench& title(std::string const& benchmarkTitle);
+
+    /**
+     * @brief Gets the title of the benchmark
+     */
+    ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
+
+    /// Name of the benchmark, will be shown in the table row.
+    Bench& name(char const* benchmarkName);
+    Bench& name(std::string const& benchmarkName);
+    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
+
+    /**
+     * @brief Set context information.
+     *
+     * The information can be accessed using custom render templates via `{{context(variableName)}}`.
+     * Trying to render a variable that hasn't been set before raises an exception.
+     * Not included in (default) markdown table.
+     *
+     * @see clearContext, render
+     *
+     * @param variableName The name of the context variable.
+     * @param variableValue The value of the context variable.
+     */
+    Bench& context(char const* variableName, char const* variableValue);
+    Bench& context(std::string const& variableName, std::string const& variableValue);
+
+    /**
+     * @brief Reset context information.
+     *
+     * This may improve efficiency when using many context entries,
+     * or improve robustness by removing spurious context entries.
+     *
+     * @see context
+     */
+    Bench& clearContext();
+
+    /**
+     * @brief Sets the batch size.
+     *
+     * E.g. number of processed bytes, or some other metric for the size of the processed data in each iteration. If you benchmark
+     * hashing of a 1000 byte long string and want byte/sec as a result, you can specify 1000 as the batch size.
+     *
+     * @tparam T Any input type is internally cast to `double`.
+     * @param b batch size
+     */
+    template <typename T>
+    Bench& batch(T b) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
+
+    /**
+     * @brief Sets the operation unit.
+     *
+     * Defaults to "op". Could be e.g. "byte" for string processing. This is used for the table header, e.g. to show `ns/byte`. Use
+     * singular (*byte*, not *bytes*). A change clears the currently collected results.
+     *
+     * @param unit The unit name.
+     */
+    Bench& unit(char const* unit);
+    Bench& unit(std::string const& unit);
+    ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
+
+    /**
+     * @brief Sets the time unit to be used for the default output.
+     *
+     * Nanobench defaults to using ns (nanoseconds) as output in the markdown. For some benchmarks this is too coarse, so it is
+     * possible to configure this. E.g. use `timeUnit(1ms, "ms")` to show `ms/op` instead of `ns/op`.
+     *
+     * @param tu Time unit to display the results in, default is 1ns.
+     * @param tuName Name for the time unit, default is "ns"
+     */
+    Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
+    ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
+
+    /**
+     * @brief Set the output stream where the resulting markdown table will be printed to.
+     *
+     * The default is `&std::cout`. You can disable all output by setting `nullptr`.
+     *
+     * @param outstream Pointer to output stream, can be `nullptr`.
+     */
+    Bench& output(std::ostream* outstream) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
+
+    /**
+     * Modern processors have a very accurate clock, being able to measure as low as 20 nanoseconds. This is the main trick that allows
+     * nanobench to be so fast: we find out how accurate the clock is, then run the benchmark only so often that the clock's accuracy
+     * is good enough for accurate measurements.
+     *
+     * The default is to run one epoch for 1000 times the clock resolution. So for 20ns resolution and 11 epochs, this gives a total
+     * runtime of
+     *
+     * @f[
+     * 20ns * 1000 * 11 \approx 0.2ms
+     * @f]
+     *
+     * To be precise, nanobench adds a 0-20% random noise to each evaluation. This is to prevent any aliasing effects, and further
+     * improves accuracy.
+     *
+     * Total runtime will be higher though: Some initial time is needed to find out the target number of iterations for each epoch, and
+     * there is some overhead involved to start & stop timers and calculate resulting statistics and writing the output.
+     *
+     * @param multiple Target number of times of clock resolution. Usually 1000 is a good compromise between runtime and accuracy.
+     */
+    Bench& clockResolutionMultiple(size_t multiple) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
+
+    /**
+     * @brief Controls number of epochs, the number of measurements to perform.
+     *
+     * The reported result will be the median of evaluation of each epoch. The higher you choose this, the more
+     * deterministic the result will be and outliers will be more easily removed. Also the `err%` will be more accurate the higher this
+     * number is. Note that the `err%` will not necessarily decrease when number of epochs is increased. But it will be a more accurate
+     * representation of the benchmarked code's runtime stability.
+     *
+     * Choose the value wisely. In practice, 11 has been shown to be a reasonable choice between runtime performance and accuracy.
+     * This setting goes hand in hand with minEpochIterations() (or minEpochTime()). If you are more interested in *median* runtime,
+     * you might want to increase epochs(). If you are more interested in *mean* runtime, you might want to increase
+     * minEpochIterations() instead.
+     *
+     * @param numEpochs Number of epochs.
+     */
+    Bench& epochs(size_t numEpochs) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
+
+    /**
+     * @brief Upper limit for the runtime of each epoch.
+     *
+     * As a safety precaution if the clock is not very accurate, we can set an upper limit for the maximum evaluation time per
+     * epoch. Default is 100ms. At least a single evaluation of the benchmark is performed.
+     *
+     * @see minEpochTime, minEpochIterations
+     *
+     * @param t Maximum target runtime for a single epoch.
+     */
+    Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
+
+    /**
+     * @brief Minimum time each epoch should take.
+     *
+     * The default member initializer of Config::mMinEpochTime is 1ms. In most cases this is exactly what you want. If you see
+     * that the evaluation is unreliable with a high `err%`, you can increase either minEpochTime() or minEpochIterations().
+     *
+     * @see maxEpochTime, minEpochIterations
+     *
+     * @param t Minimum time each epoch should take.
+     */
+    Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
+
+    /**
+     * @brief Sets the minimum number of iterations each epoch should take.
+     *
+     * Default is 1, and we rely on clockResolutionMultiple(). If the `err%` is high and you want a smoother result, you might want
+     * to increase the minimum number of iterations, or increase the minEpochTime().
+     *
+     * @see minEpochTime, maxEpochTime, epochIterations
+     *
+     * @param numIters Minimum number of iterations per epoch.
+     */
+    Bench& minEpochIterations(uint64_t numIters) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
+
+    /**
+     * Sets exactly the number of iterations for each epoch. Ignores all other epoch limits. This forces nanobench to use exactly
+     * the given number of iterations for each epoch, not more and not less. Default is 0 (disabled).
+     *
+     * @param numIters Exact number of iterations to use. Set to 0 to disable.
+     */
+    Bench& epochIterations(uint64_t numIters) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
+
+    /**
+     * @brief Sets a number of iterations that are initially performed without any measurements.
+     *
+     * Some benchmarks need a few evaluations to warm up caches / database / whatever access. Normally this should not be needed, since
+     * we show the median result so initial outliers will be filtered away automatically. If the warmup effect is large though, you
+     * might want to set it. Default is 0.
+     *
+     * @param numWarmupIters Number of warmup iterations.
+     */
+    Bench& warmup(uint64_t numWarmupIters) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
+
+    /**
+     * @brief Marks the next run as the baseline.
+     *
+     * Call `relative(true)` to mark the run as the baseline. Successive runs will be compared to this run. It is calculated by
+     *
+     * @f[
+     * 100\% * \frac{baseline}{runtime}
+     * @f]
+     *
+     *  * 100% means it is exactly as fast as the baseline
+     *  * >100% means it is faster than the baseline. E.g. 200% means the current run is twice as fast as the baseline.
+     *  * <100% means it is slower than the baseline. E.g. 50% means it is twice as slow as the baseline.
+     *
+     * See the tutorial section "Comparing Results" for example usage.
+     *
+     * @param isRelativeEnabled True to enable processing
+     */
+    Bench& relative(bool isRelativeEnabled) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
+
+    /**
+     * @brief Enables/disables performance counters.
+     *
+     * On Linux nanobench has a powerful feature to use performance counters. This enables counting of retired instructions, count
+     * number of branches, missed branches, etc. By default this is enabled, but you can disable it if you don't need that feature.
+     *
+     * @param showPerformanceCounters True to enable, false to disable.
+     */
+    Bench& performanceCounters(bool showPerformanceCounters) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
+
+    /**
+     * @brief Retrieves all benchmark results collected by the bench object so far.
+     *
+     * Each call to run() generates a Result that is stored within the Bench instance. This is mostly for advanced users who want to
+     * see all the nitty gritty details.
+     *
+     * @return All results collected so far.
+     */
+    ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
+
+    /*!
+      @verbatim embed:rst
+
+      Convenience shortcut to :cpp:func:`ankerl::nanobench::doNotOptimizeAway`.
+
+      @endverbatim
+     */
+    template <typename Arg>
+    Bench& doNotOptimizeAway(Arg&& arg);
+
+    /*!
+      @verbatim embed:rst
+
+      Sets N for asymptotic complexity calculation, so it becomes possible to calculate `Big O
+      <https://en.wikipedia.org/wiki/Big_O_notation>`_ from multiple benchmark evaluations.
+
+      Use :cpp:func:`ankerl::nanobench::Bench::complexityBigO` when the evaluation has finished. See the tutorial
+      :ref:`asymptotic-complexity` for details.
+
+      @endverbatim
+
+      @tparam T Any type is cast to `double`.
+      @param n Length of N for the next benchmark run, so it is possible to calculate `bigO`.
+     */
+    template <typename T>
+    Bench& complexityN(T n) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
+
+    /*!
+      Calculates [Big O](https://en.wikipedia.org/wiki/Big_O_notation) of the results with all preconfigured complexity functions.
+      Currently these complexity functions are fitted into the benchmark results:
+
+       @f$ \mathcal{O}(1) @f$,
+       @f$ \mathcal{O}(n) @f$,
+       @f$ \mathcal{O}(\log{}n) @f$,
+       @f$ \mathcal{O}(n\log{}n) @f$,
+       @f$ \mathcal{O}(n^2) @f$,
+       @f$ \mathcal{O}(n^3) @f$.
+
+      If we e.g. evaluate the complexity of `std::sort`, this is the result of `std::cout << bench.complexityBigO()`:
+
+      ```
+      |   coefficient |   err% | complexity
+      |--------------:|-------:|------------
+      |   5.08935e-09 |   2.6% | O(n log n)
+      |   6.10608e-08 |   8.0% | O(n)
+      |   1.29307e-11 |  47.2% | O(n^2)
+      |   2.48677e-15 |  69.6% | O(n^3)
+      |   9.88133e-06 | 132.3% | O(log n)
+      |   5.98793e-05 | 162.5% | O(1)
+      ```
+
+      So in this case @f$ \mathcal{O}(n\log{}n) @f$ provides the best approximation.
+
+      @verbatim embed:rst
+      See the tutorial :ref:`asymptotic-complexity` for details.
+      @endverbatim
+      @return Evaluation results, which can be printed or otherwise inspected.
+     */
+    std::vector<BigO> complexityBigO() const;
+
+    /**
+     * @brief Calculates bigO for a custom function.
+     *
+     * E.g. to calculate the mean squared error for @f$ \mathcal{O}(\log{}\log{}n) @f$, which is not part of the default set of
+     * complexityBigO(), you can do this:
+     *
+     * ```
+     * auto logLogN = bench.complexityBigO("O(log log n)", [](double n) {
+     *     return std::log2(std::log2(n));
+     * });
+     * ```
+     *
+     * The resulting mean squared error can be printed with `std::cout << logLogN`. E.g. it prints something like this:
+     *
+     * ```text
+     * 2.46985e-05 * O(log log n), rms=1.48121
+     * ```
+     *
+     * @tparam Op Type of mapping operation.
+     * @param name Name for the function, e.g. "O(log log n)"
+     * @param op Op's operator() maps a `double` with the desired complexity function, e.g. `log2(log2(n))`.
+     * @return BigO Error calculation, which is streamable to std::cout.
+     */
+    template <typename Op>
+    BigO complexityBigO(char const* name, Op op) const;
+
+    template <typename Op>
+    BigO complexityBigO(std::string const& name, Op op) const;
+
+    /*!
+      @verbatim embed:rst
+
+      Convenience shortcut to :cpp:func:`ankerl::nanobench::render`.
+
+      @endverbatim
+     */
+    Bench& render(char const* templateContent, std::ostream& os);
+    Bench& render(std::string const& templateContent, std::ostream& os);
+
+    Bench& config(Config const& benchmarkConfig);
+    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
+
+private:
+    Config mConfig{};
+    std::vector<Result> mResults{};
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+/**
+ * @brief Makes sure the given argument is not optimized away by the compiler.
+ *
+ * @tparam Arg Type of the argument that shouldn't be optimized away.
+ * @param arg The input that we mark as being used, even though we don't do anything with it.
+ */
+template <typename Arg>
+void doNotOptimizeAway(Arg&& arg);
+
+namespace detail {
+
+#if defined(_MSC_VER)
+void doNotOptimizeAwaySink(void const*);
+
+template <typename T>
+void doNotOptimizeAway(T const& val);
+
+#else
+
+// This assembly magic is taken directly from Google Benchmark. I previously used what Facebook's folly was doing, but
+// that seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyway.
+// See https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L443-L446
+template <typename T>
+void doNotOptimizeAway(T const& val) {
+    // NOLINTNEXTLINE(hicpp-no-assembler)
+    asm volatile("" : : "r,m"(val) : "memory");
+}
+
+template <typename T>
+void doNotOptimizeAway(T& val) {
+#    if defined(__clang__)
+    // NOLINTNEXTLINE(hicpp-no-assembler)
+    asm volatile("" : "+r,m"(val) : : "memory");
+#    else
+    // NOLINTNEXTLINE(hicpp-no-assembler)
+    asm volatile("" : "+m,r"(val) : : "memory");
+#    endif
+}
+#endif
+
+// internally used, but visible because run() is templated.
+// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
+// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
+ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
+class IterationLogic {
+public:
+    explicit IterationLogic(Bench const& bench);
+    IterationLogic(IterationLogic&&) = delete;
+    IterationLogic& operator=(IterationLogic&&) = delete;
+    IterationLogic(IterationLogic const&) = delete;
+    IterationLogic& operator=(IterationLogic const&) = delete;
+    ~IterationLogic();
+
+    ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
+    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
+    void moveResultTo(std::vector<Result>& results) noexcept;
+
+private:
+    struct Impl;
+    Impl* mPimpl;
+};
+ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
+
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+class PerformanceCounters {
+public:
+    PerformanceCounters(PerformanceCounters const&) = delete;
+    PerformanceCounters(PerformanceCounters&&) = delete;
+    PerformanceCounters& operator=(PerformanceCounters const&) = delete;
+    PerformanceCounters& operator=(PerformanceCounters&&) = delete;
+
+    PerformanceCounters();
+    ~PerformanceCounters();
+
+    void beginMeasure();
+    void endMeasure();
+    void updateResults(uint64_t numIters);
+
+    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
+
+private:
+#if ANKERL_NANOBENCH(PERF_COUNTERS)
+    LinuxPerformanceCounters* mPc = nullptr;
+#endif
+    PerfCountSet<uint64_t> mVal{};
+    PerfCountSet<bool> mHas{};
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+// Gets the singleton
+PerformanceCounters& performanceCounters();
+
+} // namespace detail
+
+class BigO {
+public:
+    using RangeMeasure = std::vector<std::pair<double, double>>;
+
+    template <typename Op>
+    static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
+        for (auto& rangeMeasure : data) {
+            rangeMeasure.first = op(rangeMeasure.first);
+        }
+        return data;
+    }
+
+    static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
+
+    template <typename Op>
+    BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
+        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
+
+    template <typename Op>
+    BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
+        : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}
+
+    BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
+    BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure);
+    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
+    ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
+
+private:
+    std::string mName{};
+    double mConstant{};
+    double mNormalizedRootMeanSquare{};
+};
+std::ostream& operator<<(std::ostream& os, BigO const& bigO);
+std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
+
+} // namespace nanobench
+} // namespace ankerl
+
+// implementation /////////////////////////////////////////////////////////////////////////////////
+
+namespace ankerl {
+namespace nanobench {
+
+constexpr uint64_t(Rng::min)() {
+    return 0;
+}
+
+constexpr uint64_t(Rng::max)() {
+    return (std::numeric_limits<uint64_t>::max)();
+}
+
+ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+uint64_t Rng::operator()() noexcept {
+    auto x = mX;
+
+    mX = UINT64_C(15241094284759029579) * mY;
+    mY = rotl(mY - x, 27);
+
+    return x;
+}
+
+ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+uint32_t Rng::bounded(uint32_t range) noexcept {
+    uint64_t const r32 = static_cast<uint32_t>(operator()());
+    auto multiresult = r32 * range;
+    return static_cast<uint32_t>(multiresult >> 32U);
+}
+
+double Rng::uniform01() noexcept {
+    // Exponent bits 0x3ff plus 52 random mantissa bits: assuming IEEE-754 binary64 this encodes a value in [1, 2).
+    auto const bits = (operator()() >> 12U) | (UINT64_C(0x3ff) << 52U);
+    // Type punning through a union is undefined behavior in C++, so copy the bits; std::memcpy is optimized anyway.
+    double value{};
+    std::memcpy(&value, &bits, sizeof(double));
+    return value - 1.0;
+}
+
+template <typename Container>
+void Rng::shuffle(Container& container) noexcept { // shuffles container in place, walking from the back to the front
+    auto i = container.size();
+    while (i > 1U) {
+        using std::swap; // unqualified call below, so ADL can pick a swap() overload for the element type
+        auto n = operator()(); // one 64bit random number feeds two swaps: low 32 bits first, then the high 32 bits
+        // using decltype(i) instead of size_t to be compatible with containers that have a 32bit index (see #80)
+        auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
+        swap(container[--i], container[b1]); // b1 < old i, since (32bit value * i) >> 32 < i (for i below 2^32)
+
+        auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
+        swap(container[--i], container[b2]); // if i is already 1 here, b2 is 0 and the element swaps with itself
+    }
+}
+
+ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
+    return (x << k) | (x >> (64U - k));
+}
+
+template <typename Op>
+ANKERL_NANOBENCH_NO_SANITIZE("integer")
+Bench& Bench::run(Op&& op) {
+    // Runs op repeatedly and measures it. Keep this method short, so the compiler can better optimize and inline op().
+    detail::IterationLogic iterationLogic(*this);
+    auto& pc = detail::performanceCounters();
+
+    while (auto n = iterationLogic.numIters()) { // keep measuring until numIters() returns 0
+        pc.beginMeasure();
+        Clock::time_point const before = Clock::now();
+        while (n-- > 0) { // only this loop sits between the two clock reads
+            op();
+        }
+        Clock::time_point const after = Clock::now();
+        pc.endMeasure();
+        pc.updateResults(iterationLogic.numIters()); // n was consumed by the loop above, so numIters() is asked again
+        iterationLogic.add(after - before, pc);
+    }
+    iterationLogic.moveResultTo(mResults); // hands what was collected over to this Bench's result list
+    return *this;
+}
+
+// Performs all evaluations.
+template <typename Op>
+Bench& Bench::run(char const* benchmarkName, Op&& op) {
+    name(benchmarkName);
+    return run(std::forward<Op>(op));
+}
+
+template <typename Op>
+Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
+    name(benchmarkName);
+    return run(std::forward<Op>(op));
+}
+
+template <typename Op>
+BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
+    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
+}
+
+template <typename Op>
+BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
+    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
+}
+
+// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
+// Any argument is cast to double.
+template <typename T>
+Bench& Bench::batch(T b) noexcept {
+    mConfig.mBatch = static_cast<double>(b);
+    return *this;
+}
+
+// Sets the computation complexity of the next run. Any argument is cast to double.
+template <typename T>
+Bench& Bench::complexityN(T n) noexcept {
+    mConfig.mComplexityN = static_cast<double>(n);
+    return *this;
+}
+
+// Convenience: makes sure none of the given arguments are optimized away by the compiler.
+template <typename Arg>
+Bench& Bench::doNotOptimizeAway(Arg&& arg) {
+    detail::doNotOptimizeAway(std::forward<Arg>(arg));
+    return *this;
+}
+
+// Makes sure none of the given arguments are optimized away by the compiler.
+template <typename Arg>
+void doNotOptimizeAway(Arg&& arg) {
+    detail::doNotOptimizeAway(std::forward<Arg>(arg));
+}
+
+namespace detail {
+
+#if defined(_MSC_VER)
+template <typename T>
+void doNotOptimizeAway(T const& val) {
+    doNotOptimizeAwaySink(&val);
+}
+
+#endif
+
+} // namespace detail
+} // namespace nanobench
+} // namespace ankerl
+
+#if defined(ANKERL_NANOBENCH_IMPLEMENT)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// implementation part - only visible in .cpp
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#    include <algorithm> // sort, reverse
+#    include <atomic>    // compare_exchange_strong in loop overhead
+#    include <cstdlib>   // getenv
+#    include <cstring>   // strstr, strncmp
+#    include <fstream>   // ifstream to parse proc files
+#    include <iomanip>   // setw, setprecision
+#    include <iostream>  // cout
+#    include <numeric>   // accumulate
+#    include <random>    // random_device
+#    include <sstream>   // to_s in Number
+#    include <stdexcept> // throw for rendering templates
+#    include <tuple>     // std::tie
+#    if defined(__linux__)
+#        include <unistd.h> //sysconf
+#    endif
+#    if ANKERL_NANOBENCH(PERF_COUNTERS)
+#        include <map> // map
+
+#        include <linux/perf_event.h>
+#        include <sys/ioctl.h>
+#        include <sys/syscall.h>
+#    endif
+
+// declarations ///////////////////////////////////////////////////////////////////////////////////
+
+namespace ankerl {
+namespace nanobench {
+
+// helper stuff that is only intended to be used internally
+namespace detail {
+
+struct TableInfo;
+
+// formatting utilities
+namespace fmt {
+
+class NumSep;
+class StreamStateRestorer;
+class Number;
+class MarkDownColumn;
+class MarkDownCode;
+
+} // namespace fmt
+} // namespace detail
+} // namespace nanobench
+} // namespace ankerl
+
+// definitions ////////////////////////////////////////////////////////////////////////////////////
+
+namespace ankerl {
+namespace nanobench {
+
+uint64_t splitMix64(uint64_t& state) noexcept;
+
+namespace detail {
+
+// helpers to get double values
+template <typename T>
+inline double d(T t) noexcept {
+    return static_cast<double>(t);
+}
+inline double d(Clock::duration duration) noexcept {
+    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
+}
+
+// Calculates clock resolution once, and remembers the result
+inline Clock::duration clockResolution() noexcept;
+
+} // namespace detail
+
+namespace templates {
+
+char const* csv() noexcept {
+    return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
+{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
+{{/result}})DELIM";
+}
+
+char const* htmlBoxplot() noexcept { // mustache template: HTML page drawing one plotly box plot per result
+    return R"DELIM(<html>
+
+<head>
+    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+</head>
+
+<body>
+    <div id="myDiv"></div>
+    <script>
+        var data = [
+            {{#result}}{
+                name: '{{name}}',
+                y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/-last}}{{/measurement}}],
+            },
+            {{/result}}
+        ];
+        var title = '{{title}}';
+
+        data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
+        var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
+    </script>
+</body>
+
+</html>)DELIM";
+}
+
+char const* pyperf() noexcept { // mustache template: JSON in pyperf layout, uses the measurements of a single result
+    return R"DELIM({
+    "benchmarks": [
+        {
+            "runs": [
+                {
+                    "values": [
+{{#measurement}}                        {{elapsed}}{{^-last}},
+{{/-last}}{{/measurement}}
+                    ]
+                }
+            ]
+        }
+    ],
+    "metadata": {
+        "loops": {{sum(iterations)}},
+        "inner_loops": {{batch}},
+        "name": "{{title}}",
+        "unit": "second"
+    },
+    "version": "1.0"
+})DELIM";
+}
+
+char const* json() noexcept {
+    return R"DELIM({
+    "results": [
+{{#result}}        {
+            "title": "{{title}}",
+            "name": "{{name}}",
+            "unit": "{{unit}}",
+            "batch": {{batch}},
+            "complexityN": {{complexityN}},
+            "epochs": {{epochs}},
+            "clockResolution": {{clockResolution}},
+            "clockResolutionMultiple": {{clockResolutionMultiple}},
+            "maxEpochTime": {{maxEpochTime}},
+            "minEpochTime": {{minEpochTime}},
+            "minEpochIterations": {{minEpochIterations}},
+            "epochIterations": {{epochIterations}},
+            "warmup": {{warmup}},
+            "relative": {{relative}},
+            "median(elapsed)": {{median(elapsed)}},
+            "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
+            "median(instructions)": {{median(instructions)}},
+            "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
+            "median(cpucycles)": {{median(cpucycles)}},
+            "median(contextswitches)": {{median(contextswitches)}},
+            "median(pagefaults)": {{median(pagefaults)}},
+            "median(branchinstructions)": {{median(branchinstructions)}},
+            "median(branchmisses)": {{median(branchmisses)}},
+            "totalTime": {{sumProduct(iterations, elapsed)}},
+            "measurements": [
+{{#measurement}}                {
+                    "iterations": {{iterations}},
+                    "elapsed": {{elapsed}},
+                    "pagefaults": {{pagefaults}},
+                    "cpucycles": {{cpucycles}},
+                    "contextswitches": {{contextswitches}},
+                    "instructions": {{instructions}},
+                    "branchinstructions": {{branchinstructions}},
+                    "branchmisses": {{branchmisses}}
+                }{{^-last}},{{/-last}}
+{{/measurement}}            ]
+        }{{^-last}},{{/-last}}
+{{/result}}    ]
+})DELIM";
+}
+
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+struct Node { // one element of a parsed mustache template
+    enum class Type { tag, content, section, inverted_section };
+
+    char const* begin; // first character of this node's text; points into the template string, nothing is copied
+    char const* end; // one past the last character of this node's text
+    std::vector<Node> children; // nested nodes; the parser only fills this for section and inverted_section
+    Type type;
+
+    template <size_t N>
+    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
+    bool operator==(char const (&str)[N]) const noexcept { // true when the text [begin, end) equals the literal str
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
+        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1); // N counts the '\0'
+    }
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+// NOLINTNEXTLINE(misc-no-recursion)
+static std::vector<Node> parseMustacheTemplate(char const** tpl) { // *tpl is moved behind each tag that gets consumed
+    std::vector<Node> nodes;
+
+    while (true) {
+        auto const* begin = std::strstr(*tpl, "{{");
+        auto const* end = begin;
+        if (begin != nullptr) {
+            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+            begin += 2;
+            end = std::strstr(begin, "}}");
+        }
+
+        if (begin == nullptr || end == nullptr) {
+            // no further complete "{{...}}" tag: all of the remaining text becomes one content node, and we are done
+            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
+            return nodes;
+        }
+
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content}); // text before the tag
+
+        // we found a tag; parsing continues right behind its closing "}}"
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+        *tpl = end + 2;
+        switch (*begin) {
+        case '/':
+            // closing tag ends this nesting level; the name behind '/' is not compared with the opening tag
+            return nodes;
+
+        case '#':
+            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section}); // name without '#'
+            break;
+
+        case '^':
+            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
+            break;
+
+        default:
+            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag}); // plain tag, text between the braces
+            break;
+        }
+    }
+}
+
+static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
+    ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
+    bool const isFirstTag = n == "-first";
+    bool const isLastTag = n == "-last";
+    if (!(isFirstTag || isLastTag)) {
+        // neither "-first" nor "-last": nothing is written, the caller has to deal with this node
+        return false;
+    }
+
+    // A node can't carry both names, so exactly one flag is set and this tells whether idx is at the named position.
+    bool const atPosition = (isFirstTag && idx == 0) || (isLastTag && idx == size - 1);
+    bool const isSection = n.type == Node::Type::section;
+    bool const isInverted = n.type == Node::Type::inverted_section;
+    if ((isSection && atPosition) || (isInverted && !atPosition)) {
+        // only the plain text children are emitted
+        for (auto const& child : n.children) {
+            if (child.type != Node::Type::content) {
+                continue;
+            }
+            out.write(child.begin, std::distance(child.begin, child.end));
+        }
+    }
+    return true;
+}
+
+static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
+    matchResult.clear();
+    auto const openPos = str.find('(');
+    auto const closePos = str.find(')', openPos);
+    if (std::string::npos == closePos) {
+        return false;
+    }
+
+    // first entry: the command name, which is everything in front of '('
+    matchResult.emplace_back(str.substr(0, openPos));
+    // further entries: the comma separated arguments, with blanks and tabs dropped
+    matchResult.emplace_back();
+    for (auto pos = openPos + 1; pos < closePos; ++pos) {
+        switch (str[pos]) {
+        case ' ':
+        case '\t':
+            break;
+        case ',':
+            matchResult.emplace_back();
+            break;
+        default:
+            matchResult.back() += str[pos];
+            break;
+        }
+    }
+    return true;
+}
+
+static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
+    using detail::d;
+
+    if (n == "title") {
+        out << config.mBenchmarkTitle;
+        return true;
+    }
+    if (n == "name") {
+        out << config.mBenchmarkName;
+        return true;
+    }
+    if (n == "unit") {
+        out << config.mUnit;
+        return true;
+    }
+    if (n == "batch") {
+        out << config.mBatch;
+        return true;
+    }
+    if (n == "complexityN") {
+        out << config.mComplexityN;
+        return true;
+    }
+    if (n == "epochs") {
+        out << config.mNumEpochs;
+        return true;
+    }
+    if (n == "clockResolution") {
+        out << d(detail::clockResolution());
+        return true;
+    }
+    if (n == "clockResolutionMultiple") {
+        out << config.mClockResolutionMultiple;
+        return true;
+    }
+    if (n == "maxEpochTime") {
+        out << d(config.mMaxEpochTime);
+        return true;
+    }
+    if (n == "minEpochTime") {
+        out << d(config.mMinEpochTime);
+        return true;
+    }
+    if (n == "minEpochIterations") {
+        out << config.mMinEpochIterations;
+        return true;
+    }
+    if (n == "epochIterations") {
+        out << config.mEpochIterations;
+        return true;
+    }
+    if (n == "warmup") {
+        out << config.mWarmup;
+        return true;
+    }
+    if (n == "relative") {
+        out << config.mIsRelative;
+        return true;
+    }
+    return false;
+}
+
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
+static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) { // writes tag n's value for r
+    if (generateConfigTag(n, r.config(), out)) {
+        return out;
+    }
+    // Everything that is not a config tag has the form "op(arg)" or "op(arg1, arg2)", e.g. "median(elapsed)".
+    // The parsing is done by hand because g++ 4.8 doesn't implement std::regex :(
+    // For reference, the one argument form as a regex:
+    //   "^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$"
+    // matchCmdArgs() yields {op, arg...}, so size 2 means one argument and size 3 means two arguments.
+    std::vector<std::string> matchResult;
+    if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
+        if (matchResult.size() == 2) {
+            if (matchResult[0] == "context") {
+                return out << r.context(matchResult[1]);
+            }
+
+            auto m = Result::fromString(matchResult[1]);
+            if (m == Result::Measure::_size) {
+                return out << 0.0; // presumably _size marks an unknown measure name (TODO confirm); it renders as 0
+            }
+
+            if (matchResult[0] == "median") {
+                return out << r.median(m);
+            }
+            if (matchResult[0] == "average") {
+                return out << r.average(m);
+            }
+            if (matchResult[0] == "medianAbsolutePercentError") {
+                return out << r.medianAbsolutePercentError(m);
+            }
+            if (matchResult[0] == "sum") {
+                return out << r.sum(m);
+            }
+            if (matchResult[0] == "minimum") {
+                return out << r.minimum(m);
+            }
+            if (matchResult[0] == "maximum") {
+                return out << r.maximum(m);
+            }
+        } else if (matchResult.size() == 3) {
+            auto m1 = Result::fromString(matchResult[1]);
+            auto m2 = Result::fromString(matchResult[2]);
+            if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
+                return out << 0.0; // same fallback as in the one argument case
+            }
+
+            if (matchResult[0] == "sumProduct") {
+                return out << r.sumProduct(m1, m2);
+            }
+        }
+    }
+
+    // For reference, the two argument form (e.g. "sumProduct(elapsed, iterations)") as a regex:
+    //   "^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$"
+
+    // nothing matched: neither a config tag nor a known operation
+    throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
+}
+
+static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
+    for (auto const& n : nodes) {
+        if (!generateFirstLast(n, idx, r.size(), out)) {
+            ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
+            switch (n.type) {
+            case Node::Type::content:
+                out.write(n.begin, std::distance(n.begin, n.end));
+                break;
+
+            case Node::Type::inverted_section:
+                throw std::runtime_error("got a inverted section inside measurement");
+
+            case Node::Type::section:
+                throw std::runtime_error("got a section inside measurement");
+
+            case Node::Type::tag: {
+                auto m = Result::fromString(std::string(n.begin, n.end));
+                if (m == Result::Measure::_size || !r.has(m)) {
+                    out << 0.0;
+                } else {
+                    out << r.get(idx, m);
+                }
+                break;
+            }
+            }
+        }
+    }
+}
+
+static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
+    auto const& r = results[idx];
+    for (auto const& n : nodes) {
+        if (!generateFirstLast(n, idx, results.size(), out)) {
+            ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
+            switch (n.type) {
+            case Node::Type::content:
+                out.write(n.begin, std::distance(n.begin, n.end));
+                break;
+
+            case Node::Type::inverted_section:
+                throw std::runtime_error("got a inverted section inside result");
+
+            case Node::Type::section:
+                if (n == "measurement") {
+                    for (size_t i = 0; i < r.size(); ++i) {
+                        generateResultMeasurement(n.children, i, r, out);
+                    }
+                } else {
+                    throw std::runtime_error("got a section inside result");
+                }
+                break;
+
+            case Node::Type::tag:
+                generateResultTag(n, r, out);
+                break;
+            }
+        }
+    }
+}
+
+} // namespace templates
+
+// helper stuff that is only intended to be used internally
+namespace detail {
+
+char const* getEnv(char const* name);
+bool isEndlessRunning(std::string const& name);
+bool isWarningsEnabled();
+
+template <typename T>
+T parseFile(std::string const& filename, bool* fail);
+
+void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
+void printStabilityInformationOnce(std::ostream* outStream);
+
+// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
+uint64_t& singletonHeaderHash() noexcept;
+
+// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
+Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
+
+// formatting utilities
+namespace fmt {
+
+// adds thousands separator to numbers
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+class NumSep : public std::numpunct<char> {
+public:
+    explicit NumSep(char sep);
+    char do_thousands_sep() const override;
+    std::string do_grouping() const override;
+
+private:
+    char mSep;
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+// RAII to save & restore a stream's state
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+class StreamStateRestorer {
+public:
+    explicit StreamStateRestorer(std::ostream& s);
+    ~StreamStateRestorer();
+
+    // sets back all stream info that we remembered at construction
+    void restore();
+
+    // don't allow copying / moving
+    StreamStateRestorer(StreamStateRestorer const&) = delete;
+    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
+    StreamStateRestorer(StreamStateRestorer&&) = delete;
+    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;
+
+private:
+    std::ostream& mStream;
+    std::locale mLocale;
+    std::streamsize const mPrecision;
+    std::streamsize const mWidth;
+    std::ostream::char_type const mFill;
+    std::ostream::fmtflags const mFmtFlags;
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+// Number formatter
+class Number {
+public:
+    Number(int width, int precision, double value);
+    Number(int width, int precision, int64_t value);
+    ANKERL_NANOBENCH(NODISCARD) std::string to_s() const;
+
+private:
+    friend std::ostream& operator<<(std::ostream& os, Number const& n);
+    std::ostream& write(std::ostream& os) const;
+
+    int mWidth;
+    int mPrecision;
+    double mValue;
+};
+
+// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
+std::string to_s(uint64_t n);
+
+std::ostream& operator<<(std::ostream& os, Number const& n);
+
+class MarkDownColumn {
+public:
+    MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept;
+    ANKERL_NANOBENCH(NODISCARD) std::string title() const;
+    ANKERL_NANOBENCH(NODISCARD) std::string separator() const;
+    ANKERL_NANOBENCH(NODISCARD) std::string invalid() const;
+    ANKERL_NANOBENCH(NODISCARD) std::string value() const;
+
+private:
+    int mWidth;
+    int mPrecision;
+    std::string mTitle;
+    std::string mSuffix;
+    double mValue;
+};
+
+// Formats any text as markdown code, escaping backticks.
+class MarkDownCode {
+public:
+    explicit MarkDownCode(std::string const& what);
+
+private:
+    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
+    std::ostream& write(std::ostream& os) const;
+
+    std::string mWhat{};
+};
+
+std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
+
+} // namespace fmt
+} // namespace detail
+} // namespace nanobench
+} // namespace ankerl
+
+// implementation /////////////////////////////////////////////////////////////////////////////////
+
+namespace ankerl {
+namespace nanobench {
+
+// Renders mustacheTemplate with the data from results into out; throws std::runtime_error for templates it can't handle.
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
+void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
+    detail::fmt::StreamStateRestorer const restorer(out);
+    out.precision(std::numeric_limits<double>::digits10);
+    auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
+
+    for (auto const& n : nodes) {
+        ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
+        switch (n.type) {
+        case templates::Node::Type::content:
+            out.write(n.begin, std::distance(n.begin, n.end));
+            break;
+
+        case templates::Node::Type::inverted_section:
+            throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
+
+        case templates::Node::Type::section:
+            if (n == "result") {
+                for (size_t i = 0; i < results.size(); ++i) {
+                    generateResult(n.children, i, results, out);
+                }
+            } else if (n == "measurement") {
+                if (results.size() != 1) {
+                    throw std::runtime_error(
+                        "render: can only use section 'measurement' here if there is a single result, but there are " +
+                        detail::fmt::to_s(results.size()));
+                }
+                // when we only have a single result, we can immediately go into its measurement.
+                auto const& r = results.front();
+                for (size_t i = 0; i < r.size(); ++i) {
+                    generateResultMeasurement(n.children, i, r, out);
+                }
+            } else {
+                throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
+            }
+            break;
+
+        case templates::Node::Type::tag:
+            if (results.empty()) {
+                // no result means there is no config to read from; results.back() below would be undefined behavior
+                throw std::runtime_error("render: tag '" + std::string(n.begin, n.end) + "' requires at least one result");
+            }
+            if (results.size() == 1) {
+                // result & config are both supported there
+                generateResultTag(n, results.front(), out);
+            } else if (!generateConfigTag(n, results.back().config(), out)) { // only config tags, from the last result
+                throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
+            }
+            break;
+        }
+    }
+}
+
+void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
+    render(mustacheTemplate.c_str(), results, out);
+}
+
+void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
+    render(mustacheTemplate, bench.results(), out);
+}
+
+void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
+    render(mustacheTemplate.c_str(), bench.results(), out);
+}
+
+namespace detail {
+
+PerformanceCounters& performanceCounters() {
+#    if defined(__clang__)
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wexit-time-destructors"
+#    endif
+    static PerformanceCounters pc; // function-local static: created on the first call and shared by every caller
+#    if defined(__clang__)
+#        pragma clang diagnostic pop
+#    endif
+    return pc;
+}
+
+// MSVC version of doNotOptimizeAway: an empty sink function, defined with optimizations switched off via the pragma.
+// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L514
+// see https://github.com/facebook/folly/blob/v2023.01.30.00/folly/lang/Hint-inl.h#L54-L58
+// see https://learn.microsoft.com/en-us/cpp/preprocessor/optimize
+#    if defined(_MSC_VER)
+#        pragma optimize("", off)
+void doNotOptimizeAwaySink(void const*) {}
+#        pragma optimize("", on)
+#    endif
+
+template <typename T>
+T parseFile(std::string const& filename, bool* fail) {
+    // Extracts one value of type T from the file. If the file can't be opened, the value-initialized T is returned.
+    T parsed{};
+    std::ifstream input(filename); // NOLINT(misc-const-correctness)
+    input >> parsed;
+    bool const extractionFailed = input.fail();
+    if (nullptr != fail) { *fail = extractionFailed; } // reporting the failure state is optional
+    return parsed;
+}
+
+char const* getEnv(char const* name) { // wraps std::getenv; the result is nullptr when the variable is not set
+#    if defined(_MSC_VER)
+#        pragma warning(push)
+#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
+#    endif
+    return std::getenv(name); // NOLINT(concurrency-mt-unsafe)
+#    if defined(_MSC_VER)
+#        pragma warning(pop)
+#    endif
+}
+
+bool isEndlessRunning(std::string const& name) {
+    char const* const requested = getEnv("NANOBENCH_ENDLESS"); // true only if the variable equals `name` exactly
+    return (requested != nullptr) && (name == requested);
+}
+
+// Warnings stay enabled unless the environment variable NANOBENCH_SUPPRESS_WARNINGS is set to something other than "0"
+bool isWarningsEnabled() {
+    char const* const suppress = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
+    return (suppress == nullptr) || (std::string("0") == suppress);
+}
+
+void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
+    warnings.clear(); // both output vectors are overwritten, not appended to
+    recommendations.clear();
+
+#    if defined(DEBUG)
+    warnings.emplace_back("DEBUG defined");
+    bool const recommendCheckFlags = true;
+#    else
+    bool const recommendCheckFlags = false;
+#    endif
+
+    bool recommendPyPerf = false;
+#    if defined(__linux__)
+    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
+    if (nprocs <= 0) {
+        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
+    } else {
+        // check frequency scaling: warn about the first CPU whose min and max scaling frequencies differ
+        for (long id = 0; id < nprocs; ++id) {
+            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
+            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
+            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq", nullptr);
+            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq", nullptr);
+            if (minFreq != maxFreq) {
+                auto minMHz = d(minFreq) / 1000.0; // the sysfs values are divided by 1000 and reported as MHz
+                auto maxMHz = d(maxFreq) / 1000.0;
+                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
+                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
+                                      " MHz");
+                recommendPyPerf = true;
+                break; // one warning is enough, stop after the first affected CPU
+            }
+        }
+
+        auto fail = false;
+        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail);
+        if (!fail && "performance" != currentGovernor) { // only cpu0's governor is inspected
+            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
+            recommendPyPerf = true;
+        }
+
+        auto noTurbo = parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo", &fail);
+        if (!fail && noTurbo == 0) { // the check is skipped when the intel_pstate file can't be parsed
+            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
+            recommendPyPerf = true;
+        }
+    }
+#    endif
+
+    if (recommendCheckFlags) {
+        recommendations.emplace_back("Make sure you compile for Release");
+    }
+    if (recommendPyPerf) {
+        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
+    }
+}
+
+void printStabilityInformationOnce(std::ostream* outStream) {
+    static bool shouldPrint = true; // cleared by the first call that has a stream and enabled warnings
+    if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) {
+        auto& os = *outStream;
+        shouldPrint = false;
+        std::vector<std::string> warnings;
+        std::vector<std::string> recommendations;
+        gatherStabilityInformation(warnings, recommendations);
+        if (warnings.empty()) {
+            return; // system looks stable: print nothing at all
+        }
+        os << "Warning, results might be unstable:" << std::endl;
+        for (auto const& w : warnings) {
+            os << "* " << w << std::endl;
+        }
+        if (!recommendations.empty()) { // some warnings carry no recommendation: don't print a dangling header
+            os << std::endl << "Recommendations" << std::endl;
+            for (auto const& r : recommendations) {
+                os << "* " << r << std::endl;
+            }
+        }
+    }
+}
+
+// Holds the hash of the table settings printed last; showResult() writes a new table header whenever the hash differs.
+uint64_t& singletonHeaderHash() noexcept {
+    static uint64_t lastHeaderHash = 0;
+    return lastHeaderHash;
+}
+
+ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
+    return ((seed << 6U) + (seed >> 2U) + UINT64_C(0x9e3779b9) + val) ^ seed; // unsigned wrap-around is expected here
+}
+
+// Estimates the clock's resolution: spins until Clock::now() changes and keeps the smallest step seen over all rounds.
+Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
+    auto smallestStep = Clock::duration::max();
+    for (size_t round = 0; round < numEvaluations; ++round) {
+        auto const start = Clock::now();
+        auto current = Clock::now();
+        // busy-wait until the clock visibly advances
+        while (current == start) {
+            current = Clock::now();
+        }
+        smallestStep = (std::min)(smallestStep, current - start);
+    }
+    return smallestStep;
+}
+
+// Returns the clock resolution; it is measured with 20 evaluations on the first call and cached afterwards
+Clock::duration clockResolution() noexcept {
+    static auto const cachedResolution = calcClockResolution(20);
+    return cachedResolution;
+}
+
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+struct IterationLogic::Impl {
+    enum class State { warmup, upscaling_runtime, measuring, endless };
+
+    explicit Impl(Bench const& bench)
+        : mBench(bench)
+        , mResult(bench.config()) {
+        printStabilityInformationOnce(mBench.output());
+
+        // target runtime per epoch: clockResolution * multiple, capped at maxEpochTime, then raised to minEpochTime
+        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
+        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
+            mTargetRuntimePerEpoch = mBench.maxEpochTime();
+        }
+        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
+            mTargetRuntimePerEpoch = mBench.minEpochTime();
+        }
+
+        if (isEndlessRunning(mBench.name())) {
+            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
+            mNumIters = (std::numeric_limits<uint64_t>::max)();
+            mState = State::endless;
+        } else if (0 != mBench.warmup()) {
+            mNumIters = mBench.warmup(); // the first add() call only evaluates timing, it stores no measurement
+            mState = State::warmup;
+        } else if (0 != mBench.epochIterations()) {
+            // exact number of iterations
+            mNumIters = mBench.epochIterations();
+            mState = State::measuring;
+        } else {
+            mNumIters = mBench.minEpochIterations(); // starting point; add() scales it up towards the target runtime
+            mState = State::upscaling_runtime;
+        }
+    }
+
+    // calculates new iters from elapsed & iters so that one epoch hits the target runtime; never below minEpochIterations.
+    ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
+        auto doubleElapsed = d(elapsed);
+        auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
+        auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
+
+        auto doubleMinEpochIters = d(mBench.minEpochIterations());
+        if (doubleNewIters < doubleMinEpochIters) {
+            doubleNewIters = doubleMinEpochIters;
+        }
+        doubleNewIters *= 1.0 + 0.2 * mRng.uniform01(); // noise: up to +20%, assuming uniform01() is in [0, 1)
+
+        // +0.5 for correct rounding when casting
+        // NOLINTNEXTLINE(bugprone-incorrect-roundings)
+        return static_cast<uint64_t>(doubleNewIters + 0.5);
+    }
+
+    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
+        if (elapsed * 10 < mTargetRuntimePerEpoch) {
+            // we are far below the target runtime. Multiply iterations by 10, unless that would not fit into uint64_t.
+            if (mNumIters > (std::numeric_limits<uint64_t>::max)() / 10U) {
+                // overflow :-( Checked up front: a wrapped product can still exceed mNumIters (2^62 * 10 wraps to 2^63)
+                showResult("iterations overflow. Maybe your code got optimized away?");
+                mNumIters = 0;
+                return;
+            }
+            mNumIters *= 10;
+        } else {
+            mNumIters = calcBestNumIters(elapsed, mNumIters); // close enough to compute the iteration count directly
+        }
+    }
+
+    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
+#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)
+        auto oldIters = mNumIters;
+#    endif
+
+        switch (mState) { // each case decides the next state and the iteration count for the next epoch
+        case State::warmup:
+            if (isCloseEnoughForMeasurements(elapsed)) {
+                // if elapsed is close enough, we can skip upscaling and go right to measurements
+                // still, we don't add the result to the measurements.
+                mState = State::measuring;
+                mNumIters = calcBestNumIters(elapsed, mNumIters);
+            } else {
+                // not close enough: switch to upscaling
+                mState = State::upscaling_runtime;
+                upscale(elapsed);
+            }
+            break;
+
+        case State::upscaling_runtime:
+            if (isCloseEnoughForMeasurements(elapsed)) {
+                // if we are close enough, add measurement and switch to always measuring
+                mState = State::measuring;
+                mTotalElapsed += elapsed;
+                mTotalNumIters += mNumIters;
+                mResult.add(elapsed, mNumIters, pc);
+                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
+            } else {
+                upscale(elapsed);
+            }
+            break;
+
+        case State::measuring:
+            // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
+            // that fluctuation, or else we would bias the result
+            mTotalElapsed += elapsed;
+            mTotalNumIters += mNumIters;
+            mResult.add(elapsed, mNumIters, pc);
+            if (0 != mBench.epochIterations()) {
+                mNumIters = mBench.epochIterations();
+            } else {
+                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
+            }
+            break;
+
+        case State::endless:
+            mNumIters = (std::numeric_limits<uint64_t>::max)();
+            break;
+        }
+
+        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
+            // we got all the results that we need, finish it
+            showResult(""); // NOTE(review): may throw inside this noexcept function -> std::terminate; confirm acceptable
+            mNumIters = 0;
+        }
+
+        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, d(elapsed.count())) << " elapsed, "
+                                           << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) << " target. oldIters="
+                                           << oldIters << ", mNumIters=" << mNumIters << ", mState=" << static_cast<int>(mState));
+    }
+
+    // NOLINTNEXTLINE(readability-function-cognitive-complexity)
+    void showResult(std::string const& errorMessage) const {
+        ANKERL_NANOBENCH_LOG(errorMessage);
+
+        if (mBench.output() != nullptr) { // without an output stream nothing is printed
+            // prepare column data ///////
+            std::vector<fmt::MarkDownColumn> columns;
+
+            auto rMedian = mResult.median(Result::Measure::elapsed);
+
+            if (mBench.relative()) {
+                double d = 100.0; // stays at 100% when no earlier result exists to compare against
+                if (!mBench.results().empty()) {
+                    d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
+                }
+                columns.emplace_back(11, 1, "relative", "%", d);
+            }
+
+            if (mBench.complexityN() > 0) {
+                columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
+            }
+
+            columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
+                                 rMedian / (mBench.timeUnit().count() * mBench.batch()));
+            columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
+
+            double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
+            columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
+
+            double rInsMedian = -1.0; // -1 marks "not measured"; the IPC column needs both medians to be positive
+            if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
+                rInsMedian = mResult.median(Result::Measure::instructions);
+                columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
+            }
+
+            double rCycMedian = -1.0;
+            if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
+                rCycMedian = mResult.median(Result::Measure::cpucycles);
+                columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
+            }
+            if (rInsMedian > 0.0 && rCycMedian > 0.0) {
+                columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
+            }
+            if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
+                double const rBraMedian = mResult.median(Result::Measure::branchinstructions);
+                columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
+                if (mResult.has(Result::Measure::branchmisses)) {
+                    double p = 0.0;
+                    if (rBraMedian >= 1e-9) { // avoid dividing by a (near) zero branch count
+                        p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
+                    }
+                    columns.emplace_back(10, 1, "miss%", "%", p);
+                }
+            }
+
+            columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));
+
+            // write everything
+            auto& os = *mBench.output();
+
+            // combine all elements that are relevant for printing the header
+            uint64_t hash = 0;
+            hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
+            hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
+            hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
+            hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
+            hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
+            hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
+
+            if (hash != singletonHeaderHash()) {
+                singletonHeaderHash() = hash;
+
+                // header settings differ from the last printed table (or none was printed yet): print header
+                os << std::endl;
+                for (auto const& col : columns) {
+                    os << col.title();
+                }
+                os << "| " << mBench.title() << std::endl;
+
+                for (auto const& col : columns) {
+                    os << col.separator();
+                }
+                os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
+            }
+
+            if (!errorMessage.empty()) { // error row: placeholder cells plus the message instead of values
+                for (auto const& col : columns) {
+                    os << col.invalid();
+                }
+                os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
+            } else {
+                for (auto const& col : columns) {
+                    os << col.value();
+                }
+                os << "| ";
+                auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05; // error of 5% or more is flagged
+                if (showUnstable) {
+                    os << ":wavy_dash: ";
+                }
+                os << fmt::MarkDownCode(mBench.name());
+                if (showUnstable) {
+                    auto avgIters = d(mTotalNumIters) / d(mBench.epochs());
+                    // NOLINTNEXTLINE(bugprone-incorrect-roundings)
+                    auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
+
+                    os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
+                       << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
+                }
+                os << std::endl;
+            }
+        }
+    }
+
+    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
+        return mTargetRuntimePerEpoch * 2 <= elapsed * 3; // i.e. elapsed reached at least 2/3 of the target runtime
+    }
+
+    uint64_t mNumIters = 1;                            // NOLINT(misc-non-private-member-variables-in-classes)
+    Bench const& mBench;                               // NOLINT(misc-non-private-member-variables-in-classes)
+    std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes)
+    Result mResult;                                    // NOLINT(misc-non-private-member-variables-in-classes)
+    Rng mRng{123};                                     // NOLINT(misc-non-private-member-variables-in-classes)
+    std::chrono::nanoseconds mTotalElapsed{};          // NOLINT(misc-non-private-member-variables-in-classes)
+    uint64_t mTotalNumIters = 0;                       // NOLINT(misc-non-private-member-variables-in-classes)
+    State mState = State::upscaling_runtime;           // NOLINT(misc-non-private-member-variables-in-classes)
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+IterationLogic::IterationLogic(Bench const& bench)
+    : mPimpl(new Impl(bench)) {} // raw owning pointer; released in the destructor below
+
+IterationLogic::~IterationLogic() {
+    delete mPimpl;
+}
+
+uint64_t IterationLogic::numIters() const noexcept {
+    ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
+    return mPimpl->mNumIters; // Impl sets this to 0 once all epochs are collected or upscaling overflowed
+}
+
+void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
+    mPimpl->add(elapsed, pc); // feeds one epoch's timing and counters into the state machine
+}
+
+void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
+    results.emplace_back(std::move(mPimpl->mResult)); // mResult is moved-from afterwards
+}
+
+#    if ANKERL_NANOBENCH(PERF_COUNTERS)
+
+ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
+class LinuxPerformanceCounters {
+public:
+    struct Target { // where updateResults() stores a counter's value, and which overhead corrections it applies
+        Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
+            : targetValue(targetValue_)
+            , correctMeasuringOverhead(correctMeasuringOverhead_)
+            , correctLoopOverhead(correctLoopOverhead_) {}
+
+        uint64_t* targetValue{};         // NOLINT(misc-non-private-member-variables-in-classes)
+        bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
+        bool correctLoopOverhead{};      // NOLINT(misc-non-private-member-variables-in-classes)
+    };
+
+    LinuxPerformanceCounters() = default;
+    LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete;
+    LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete;
+    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete;
+    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete;
+    ~LinuxPerformanceCounters();
+
+    // start() and stop() are no-ops in this implementation; beginMeasure()/endMeasure() talk to the kernel.
+    inline void start() {}
+
+    inline void stop() {}
+
+    bool monitor(perf_sw_ids swId, Target target);
+    bool monitor(perf_hw_id hwId, Target target);
+
+    ANKERL_NANOBENCH(NODISCARD) bool hasError() const noexcept {
+        return mHasError; // set when an ioctl()/read() in beginMeasure()/endMeasure() failed
+    }
+
+    // Resets the whole counter group (PERF_IOC_FLAG_GROUP) to zero and then enables it.
+    // A failing ioctl sets mHasError; once that is set, this function returns without doing anything.
+    inline void beginMeasure() {
+        if (mHasError) {
+            return;
+        }
+
+        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
+        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
+        if (mHasError) {
+            return;
+        }
+
+        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
+        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
+    }
+
+    inline void endMeasure() { // disables the counter group and reads all of its values into mCounters
+        if (mHasError) {
+            return;
+        }
+
+        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
+        mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
+        if (mHasError) {
+            return;
+        }
+
+        auto const numBytes = sizeof(uint64_t) * mCounters.size();
+        auto ret = read(mFd, mCounters.data(), numBytes); // a single read() on mFd fills the complete buffer
+        mHasError = ret != static_cast<ssize_t>(numBytes); // a short or failed read counts as an error
+    }
+
+    void updateResults(uint64_t numIters);
+
+    // integer division rounded to the nearest value: half of the divisor is added before dividing
+    template <typename T>
+    static inline T divRounded(T a, T divisor) {
+        return (divisor / 2 + a) / divisor;
+    }
+
+    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+    static inline uint32_t mix(uint32_t x) noexcept {
+        x = x ^ (x << 13U); // three shift-and-xor steps: left 13, right 17, left 5
+        x = x ^ (x >> 17U);
+        x = x ^ (x << 5U);
+        return x;
+    }
+
+    template <typename Op>
+    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+    void calibrate(Op&& op) {
+        // clear current calibration data,
+        for (auto& v : mCalibratedOverhead) {
+            v = UINT64_C(0);
+        }
+
+        // create new calibration data: per slot, the minimum counter value seen over 100 measured runs of `op`
+        auto newCalibration = mCalibratedOverhead;
+        for (auto& v : newCalibration) {
+            v = (std::numeric_limits<uint64_t>::max)();
+        }
+        for (size_t iter = 0; iter < 100; ++iter) {
+            beginMeasure();
+            op();
+            endMeasure();
+            if (mHasError) {
+                return; // measuring failed: mCalibratedOverhead stays zeroed
+            }
+
+            for (size_t i = 0; i < newCalibration.size(); ++i) {
+                auto diff = mCounters[i];
+                if (newCalibration[i] > diff) {
+                    newCalibration[i] = diff;
+                }
+            }
+        }
+
+        mCalibratedOverhead = std::move(newCalibration);
+
+        {
+            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
+            // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
+            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
+            // see https://godbolt.org/z/49RVQ5
+            uint64_t const numIters = 100000U + (std::random_device{}() & 3U);
+            uint64_t n = numIters;
+            uint32_t x = 1234567;
+
+            beginMeasure();
+            while (n-- > 0) {
+                x = mix(x);
+            }
+            endMeasure();
+            detail::doNotOptimizeAway(x);
+            auto measure1 = mCounters;
+
+            n = numIters;
+            beginMeasure();
+            while (n-- > 0) {
+                // we now run *twice* so we can easily calculate the overhead
+                x = mix(x);
+                x = mix(x);
+            }
+            endMeasure();
+            detail::doNotOptimizeAway(x);
+            auto measure2 = mCounters;
+
+            for (size_t i = 0; i < mCounters.size(); ++i) {
+                // m1 = N*(loop + mix), m2 = N*(loop + 2*mix)  =>  2*m1 - m2 = N*loop; all differences are clamped at 0
+                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
+                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
+                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
+
+                mLoopOverhead[i] = divRounded(overhead, numIters); // overhead per single loop iteration
+            }
+        }
+    }
+
+private:
+    bool monitor(uint32_t type, uint64_t eventid, Target target);
+
+    std::map<uint64_t, Target> mIdToTarget{};
+
+    // start with the 3 header slots of read_format (indices 0..2, as used by updateResults()); monitor() resizes them.
+    std::vector<uint64_t> mCounters = std::vector<uint64_t>(3); // braces would create ONE element holding the value 3
+    std::vector<uint64_t> mCalibratedOverhead = std::vector<uint64_t>(3);
+    std::vector<uint64_t> mLoopOverhead = std::vector<uint64_t>(3);
+
+    uint64_t mTimeEnabledNanos = 0;
+    uint64_t mTimeRunningNanos = 0;
+    int mFd = -1;
+    bool mHasError = false;
+};
+ANKERL_NANOBENCH(IGNORE_PADDED_POP)
+
+LinuxPerformanceCounters::~LinuxPerformanceCounters() {
+    if (mFd != -1) { // -1: no event was ever stored in mFd, so there is nothing to close
+        close(mFd);
+    }
+}
+
+bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
+    return monitor(PERF_TYPE_SOFTWARE, swId, target); // software event: forwards to the generic overload
+}
+
+bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
+    return monitor(PERF_TYPE_HARDWARE, hwId, target); // hardware event: forwards to the generic overload
+}
+
+// overflow is ok, it's checked
+ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
+    // clear old data
+    for (auto& id_value : mIdToTarget) {
+        *id_value.second.targetValue = UINT64_C(0);
+    }
+
+    if (mHasError) {
+        return; // all targets stay at 0 when measuring failed
+    }
+
+    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1]; // slots 1 and 2 hold the two time values
+    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
+
+    for (uint64_t i = 0; i < mCounters[0]; ++i) { // slot 0: number of (value, id) pairs following the 3 header slots
+        auto idx = static_cast<size_t>(3 + i * 2 + 0); // index of the value; its id sits directly behind it
+        auto id = mCounters[idx + 1U];
+
+        auto it = mIdToTarget.find(id);
+        if (it != mIdToTarget.end()) {
+
+            auto& tgt = it->second;
+            *tgt.targetValue = mCounters[idx];
+            if (tgt.correctMeasuringOverhead) { // subtract the calibrated begin/end overhead, clamped at 0
+                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
+                    *tgt.targetValue -= mCalibratedOverhead[idx];
+                } else {
+                    *tgt.targetValue = 0U;
+                }
+            }
+            if (tgt.correctLoopOverhead) { // subtract the per-iteration loop overhead times numIters, clamped at 0
+                auto correctionVal = mLoopOverhead[idx] * numIters;
+                if (*tgt.targetValue >= correctionVal) {
+                    *tgt.targetValue -= correctionVal;
+                } else {
+                    *tgt.targetValue = 0U;
+                }
+            }
+        }
+    }
+}
+
+bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
+    *target.targetValue = (std::numeric_limits<uint64_t>::max)(); // placeholder until updateResults() writes a value
+    if (mHasError) {
+        return false;
+    }
+
+    auto pea = perf_event_attr();
+    std::memset(&pea, 0, sizeof(perf_event_attr));
+    pea.type = type;
+    pea.size = sizeof(perf_event_attr);
+    pea.config = eventid;
+    pea.disabled = 1; // start counter as disabled
+    pea.exclude_kernel = 1;
+    pea.exclude_hv = 1;
+
+    // NOLINTNEXTLINE(hicpp-signed-bitwise)
+    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+    const int pid = 0;                    // the current process
+    const int cpu = -1;                   // all CPUs
+#        if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
+    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
+#        else
+    const unsigned long flags = 0;
+#        endif
+
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
+    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
+    if (-1 == fd) {
+        return false;
+    }
+    uint64_t id = 0;
+    // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
+    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
+        close(fd); // couldn't get id: close the event again so it neither leaks nor stays in the group untracked
+        return false;
+    }
+    if (-1 == mFd) {
+        // first fully registered event: remember its fd, it is passed as group fd for all later events
+        mFd = fd;
+    }
+
+    // insert into map, rely on the fact that map's references are constant.
+    mIdToTarget.emplace(id, target);
+
+    // prepare readformat with the correct size (after the insert)
+    auto size = 3 + 2 * mIdToTarget.size();
+    mCounters.resize(size);
+    mCalibratedOverhead.resize(size);
+    mLoopOverhead.resize(size);
+
+    return true;
+}
+
+PerformanceCounters::PerformanceCounters()
+    : mPc(new LinuxPerformanceCounters())
+    , mVal()
+    , mHas() {
+
+    // HW events
+    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
+    if (!mHas.cpuCycles) {
+        // Fallback to cycles counter, reference cycles not available in many systems.
+        mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
+    }
+    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
+    mHas.branchInstructions =
+        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
+    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
+    // only `instructions` also requests the loop-overhead correction; every counter requests the measuring-overhead one
+
+    // SW events
+    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
+    mHas.contextSwitches =
+        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
+
+    mPc->start();
+    mPc->calibrate([] { // the calibrated operation is two back-to-back clock reads
+        auto before = ankerl::nanobench::Clock::now();
+        auto after = ankerl::nanobench::Clock::now();
+        (void)before;
+        (void)after;
+    });
+
+    if (mPc->hasError()) {
+        // something failed, don't monitor anything.
+        mHas = PerfCountSet<bool>{};
+    }
+}
+
+PerformanceCounters::~PerformanceCounters() {
+    // no need to check for nullptr, delete nullptr has no effect
+    delete mPc;
+}
+
+void PerformanceCounters::beginMeasure() {
+    mPc->beginMeasure(); // forwards to the Linux implementation
+}
+
+void PerformanceCounters::endMeasure() {
+    mPc->endMeasure(); // forwards to the Linux implementation
+}
+
+void PerformanceCounters::updateResults(uint64_t numIters) {
+    mPc->updateResults(numIters); // writes the corrected counter values through the registered target pointers
+}
+
+#    else
+
+PerformanceCounters::PerformanceCounters() = default; // build without perf counter support: all operations do nothing
+PerformanceCounters::~PerformanceCounters() = default;
+void PerformanceCounters::beginMeasure() {}
+void PerformanceCounters::endMeasure() {}
+void PerformanceCounters::updateResults(uint64_t) {}
+
+#    endif
+
+ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
+    return mVal; // the counter values
+}
+ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
+    return mHas; // which of the counters are available
+}
+
+// formatting utilities
+namespace fmt {
+
+// locale facet (see Number::write) that adds a configurable thousands separator to numbers
+NumSep::NumSep(char sep)
+    : mSep(sep) {}
+
+char NumSep::do_thousands_sep() const {
+    return mSep;
+}
+
+std::string NumSep::do_grouping() const {
+    return "\003"; // grouping string with the single value 3: digits are grouped in threes
+}
+
+// RAII to save & restore a stream's state: locale, precision, width, fill character and format flags
+StreamStateRestorer::StreamStateRestorer(std::ostream& s)
+    : mStream(s)
+    , mLocale(s.getloc())
+    , mPrecision(s.precision())
+    , mWidth(s.width())
+    , mFill(s.fill())
+    , mFmtFlags(s.flags()) {}
+
+StreamStateRestorer::~StreamStateRestorer() {
+    restore(); // the destructor restores automatically; restore() may also be called explicitly
+}
+
+// sets back all stream info that we remembered at construction
+void StreamStateRestorer::restore() {
+    mStream.imbue(mLocale);
+    mStream.precision(mPrecision);
+    mStream.width(mWidth);
+    mStream.fill(mFill);
+    mStream.flags(mFmtFlags);
+}
+
+// Number to be printed with field width `width` and `precision` decimals.
+// The integer value is converted with d() before being stored in mValue.
+Number::Number(int width, int precision, int64_t value)
+    : mWidth(width)
+    , mPrecision(precision)
+    , mValue(d(value)) {}
+
+// Same as above for a value that already is a double.
+Number::Number(int width, int precision, double value)
+    : mWidth(width)
+    , mPrecision(precision)
+    , mValue(value) {}
+
+// Writes the value to os in fixed notation using the stored width and precision,
+// with ',' as thousands separator. The stream's previous state is put back on
+// return by the StreamStateRestorer.
+std::ostream& Number::write(std::ostream& os) const {
+    StreamStateRestorer const restorer(os);
+    // the std::locale constructor is handed the newly allocated NumSep facet
+    os.imbue(std::locale(os.getloc(), new NumSep(',')));
+    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
+    return os;
+}
+
+// Returns what write() produces, as a string.
+std::string Number::to_s() const {
+    std::stringstream ss;
+    write(ss);
+    return ss.str();
+}
+
+// Converts n to its decimal representation without involving streams or locales.
+std::string to_s(uint64_t n) {
+    // the largest uint64_t, 18446744073709551615, has 20 digits
+    char digits[20];
+    char* const last = digits + sizeof(digits);
+    char* first = last;
+
+    // produce the digits from least to most significant, filling the buffer backwards;
+    // the loop body runs at least once so that 0 yields "0"
+    for (;;) {
+        --first;
+        *first = static_cast<char>('0' + static_cast<char>(n % 10));
+        n /= 10;
+        if (n == 0) {
+            break;
+        }
+    }
+    return std::string(first, last);
+}
+
+// Stream insertion for Number; forwards to Number::write().
+std::ostream& operator<<(std::ostream& os, Number const& n) {
+    return n.write(os);
+}
+
+// One column of the markdown result table: total width w, precision prec for the
+// value, column title tit, suffix suff appended to the value, and the value itself.
+MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept
+    : mWidth(w)
+    , mPrecision(prec)
+    , mTitle(std::move(tit))
+    , mSuffix(std::move(suff))
+    , mValue(val) {}
+
+// Header cell: '|', the title right-aligned in mWidth - 2 characters, then a space.
+std::string MarkDownColumn::title() const {
+    std::stringstream ss;
+    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
+    return ss.str();
+}
+
+// Separator cell: mWidth dashes with the first replaced by '|' and the last by ':'.
+// NOTE(review): front()/back() assume mWidth >= 1 - confirm all callers pass a positive width.
+std::string MarkDownColumn::separator() const {
+    std::string sep(static_cast<size_t>(mWidth), '-');
+    sep.front() = '|';
+    sep.back() = ':';
+    return sep;
+}
+
+// Cell for a value that is not available: blanks with a leading '|' and a single
+// '-' in the second to last position.
+// NOTE(review): the index sep.size() - 2 assumes mWidth >= 2 - confirm.
+std::string MarkDownColumn::invalid() const {
+    std::string sep(static_cast<size_t>(mWidth), ' ');
+    sep.front() = '|';
+    sep[sep.size() - 2] = '-';
+    return sep;
+}
+
+// Value cell: '|', the formatted number, the suffix, then a space. The number's field
+// width is reduced by the suffix length so that number plus suffix fill mWidth - 2.
+std::string MarkDownColumn::value() const {
+    std::stringstream ss;
+    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
+    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
+    return ss.str();
+}
+
+// Wraps `what` in backticks so it renders as markdown code. Backticks that occur
+// inside the text are escaped by writing them twice.
+MarkDownCode::MarkDownCode(std::string const& what) {
+    // room for the text plus the two enclosing backticks
+    mWhat.reserve(what.size() + 2);
+    mWhat += '`';
+    for (auto it = what.begin(); it != what.end(); ++it) {
+        // ordinary characters are copied once, a backtick twice
+        mWhat.append(('`' == *it) ? 2U : 1U, *it);
+    }
+    mWhat += '`';
+}
+
+// Writes the text prepared by the constructor to os.
+std::ostream& MarkDownCode::write(std::ostream& os) const {
+    return os << mWhat;
+}
+
+// Stream insertion for MarkDownCode; forwards to MarkDownCode::write().
+std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
+    return mdCode.write(os);
+}
+} // namespace fmt
+} // namespace detail
+
+// provide implementation here so it's only generated once
+// Note: the move constructor is unconditionally noexcept, while the move assignment
+// is noexcept only if ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE) evaluates to true.
+Config::Config() = default;
+Config::~Config() = default;
+Config& Config::operator=(Config const&) = default;
+Config& Config::operator=(Config&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
+Config::Config(Config const&) = default;
+Config::Config(Config&&) noexcept = default;
+
+// provide implementation here so it's only generated once
+// (same noexcept arrangement as for Config above)
+Result::~Result() = default;
+Result& Result::operator=(Result const&) = default;
+Result& Result::operator=(Result&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
+Result::Result(Result const&) = default;
+Result::Result(Result&&) noexcept = default;
+
+namespace detail {
+// Converts an enumerator to the value of its underlying integer type.
+// Used below to index mNameToMeasurements with a Result::Measure.
+template <typename T>
+inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
+    return static_cast<typename std::underlying_type<T>::type>(val);
+}
+} // namespace detail
+
+// Result returned after a benchmark has finished. Can be used as a baseline for relative().
+// Takes ownership of the given configuration. mNameToMeasurements is initialized from
+// Measure::_size, presumably creating one empty series per measure - confirm against
+// the member's declared type.
+Result::Result(Config benchmarkConfig)
+    : mConfig(std::move(benchmarkConfig))
+    , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
+
+// Records the outcome of one measurement run: the iteration count, the elapsed time
+// per iteration and, for each performance counter that pc.has() reports as available,
+// the counter value per iteration.
+// NOTE(review): all per-iteration values divide by iters; presumably callers never
+// pass iters == 0 - confirm.
+void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
+    using detail::d;
+    using detail::u;
+
+    double const dIters = d(iters);
+    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
+
+    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
+    if (pc.has().pageFaults) {
+        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
+    }
+    if (pc.has().cpuCycles) {
+        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
+    }
+    if (pc.has().contextSwitches) {
+        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
+    }
+    if (pc.has().instructions) {
+        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
+    }
+    if (pc.has().branchInstructions) {
+        double branchInstructions = 0.0;
+        // correcting branches: remove branch introduced by the while (...) loop for each iteration.
+        // (iters + 1 branches are subtracted; if fewer were counted, the result stays 0)
+        if (pc.val().branchInstructions > iters + 1U) {
+            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
+        }
+        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
+
+        // branch misses are only recorded when branch instructions are available as well
+        if (pc.has().branchMisses) {
+            // correcting branch misses
+            double branchMisses = d(pc.val().branchMisses);
+            if (branchMisses > branchInstructions) {
+                // can't have more branch misses than (corrected) branches
+                branchMisses = branchInstructions;
+            }
+
+            // assuming at least one missed branch for the loop
+            // (note: the value is then floored at 1.0, so at least one miss is always recorded)
+            branchMisses -= 1.0;
+            if (branchMisses < 1.0) {
+                branchMisses = 1.0;
+            }
+            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
+        }
+    }
+}
+
+// The configuration this result was created with.
+Config const& Result::config() const noexcept {
+    return mConfig;
+}
+
+// Returns the median of data, or 0.0 if data is empty.
+// Side effect: data is left sorted in ascending order, which is why callers pass a copy.
+inline double calcMedian(std::vector<double>& data) {
+    auto const count = data.size();
+    if (0U == count) {
+        return 0.0;
+    }
+    std::sort(data.begin(), data.end());
+
+    auto const upperMid = count / 2U;
+    bool const hasSingleMiddle = (count % 2U) != 0U;
+    // odd count: the middle element; even count: mean of the two middle elements
+    return hasSingleMiddle ? data[upperMid] : (data[upperMid - 1U] + data[upperMid]) / 2.0;
+}
+
+// Median of all values recorded for measure m (0.0 if none were recorded).
+double Result::median(Measure m) const {
+    // create a copy so we can sort
+    auto data = mNameToMeasurements[detail::u(m)];
+    return calcMedian(data);
+}
+
+// Arithmetic mean of all values recorded for measure m (0.0 if none were recorded).
+double Result::average(Measure m) const {
+    using detail::d;
+    auto const& data = mNameToMeasurements[detail::u(m)];
+    if (data.empty()) {
+        return 0.0;
+    }
+
+    // total of the values divided by how many there are
+    return sum(m) / d(data.size());
+}
+
+double Result::medianAbsolutePercentError(Measure m) const {
+    // create copy
+    auto data = mNameToMeasurements[detail::u(m)];
+
+    // calculates MdAPE which is the median of percentage error
+    // see https://support.numxl.com/hc/en-us/articles/115001223503-MdAPE-Median-Absolute-Percentage-Error
+    auto med = calcMedian(data);
+
+    // transform the data to absolute error
+    for (auto& x : data) {
+        x = (x - med) / x;
+        if (x < 0) {
+            x = -x;
+        }
+    }
+    return calcMedian(data);
+}
+
+// Sum of all values recorded for measure m (0.0 if none were recorded).
+double Result::sum(Measure m) const noexcept {
+    auto const& data = mNameToMeasurements[detail::u(m)];
+    return std::accumulate(data.begin(), data.end(), 0.0);
+}
+
+// Sum over i of (i-th value of m1) * (i-th value of m2).
+// Returns 0.0 when the two measures do not hold the same number of values.
+double Result::sumProduct(Measure m1, Measure m2) const noexcept {
+    auto const& lhs = mNameToMeasurements[detail::u(m1)];
+    auto const& rhs = mNameToMeasurements[detail::u(m2)];
+
+    if (lhs.size() == rhs.size()) {
+        // accumulates lhs[i] * rhs[i] front to back, starting from 0.0
+        return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0);
+    }
+    return 0.0;
+}
+
+// True if at least one value was recorded for measure m.
+bool Result::has(Measure m) const noexcept {
+    return !mNameToMeasurements[detail::u(m)].empty();
+}
+
+// The idx-th recorded value of measure m. Uses at(), so an out-of-range idx throws.
+double Result::get(size_t idx, Measure m) const {
+    auto const& data = mNameToMeasurements[detail::u(m)];
+    return data.at(idx);
+}
+
+// True if size() is zero.
+bool Result::empty() const noexcept {
+    return 0U == size();
+}
+
+// Number of recorded runs, taken from the `elapsed` series (add() appends to it on every call).
+size_t Result::size() const noexcept {
+    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
+    return data.size();
+}
+
+// Smallest value recorded for measure m (0.0 if none were recorded).
+double Result::minimum(Measure m) const noexcept {
+    auto const& data = mNameToMeasurements[detail::u(m)];
+    if (data.empty()) {
+        return 0.0;
+    }
+
+    // here it's safe to assume that at least one element is there
+    return *std::min_element(data.begin(), data.end());
+}
+
+// Largest value recorded for measure m (0.0 if none were recorded).
+double Result::maximum(Measure m) const noexcept {
+    auto const& data = mNameToMeasurements[detail::u(m)];
+    if (data.empty()) {
+        return 0.0;
+    }
+
+    // here it's safe to assume that at least one element is there
+    return *std::max_element(data.begin(), data.end());
+}
+
+// Looks up the context value stored under variableName in the configuration.
+// Uses at(); presumably mContext is a standard map type, in which case a missing
+// name throws std::out_of_range - confirm against the declaration.
+std::string const& Result::context(char const* variableName) const {
+    return mConfig.mContext.at(variableName);
+}
+
+// Same lookup for a std::string key.
+std::string const& Result::context(std::string const& variableName) const {
+    return mConfig.mContext.at(variableName);
+}
+
+// Maps the textual name of a measure to its enumerator.
+// The comparison is exact; an unknown name yields Measure::_size.
+Result::Measure Result::fromString(std::string const& str) {
+    struct NamedMeasure {
+        char const* name;
+        Measure measure;
+    };
+    static NamedMeasure const known[] = {
+        {"elapsed", Measure::elapsed},
+        {"iterations", Measure::iterations},
+        {"pagefaults", Measure::pagefaults},
+        {"cpucycles", Measure::cpucycles},
+        {"contextswitches", Measure::contextswitches},
+        {"instructions", Measure::instructions},
+        {"branchinstructions", Measure::branchinstructions},
+        {"branchmisses", Measure::branchmisses},
+    };
+
+    for (auto const& candidate : known) {
+        if (str == candidate.name) {
+            return candidate.measure;
+        }
+    }
+    // not found, return _size
+    return Measure::_size;
+}
+
+// Configuration of a microbenchmark.
+// A default-constructed Bench writes its output to std::cout.
+Bench::Bench() {
+    mConfig.mOut = &std::cout;
+}
+
+// Copy and move operations are defaulted here, out of line.
+Bench::Bench(Bench&&) noexcept = default;
+Bench& Bench::operator=(Bench&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
+Bench::Bench(Bench const&) = default;
+Bench& Bench::operator=(Bench const&) = default;
+Bench::~Bench() noexcept = default;
+
+// Returns the configured batch value (mConfig.mBatch).
+double Bench::batch() const noexcept {
+    return mConfig.mBatch;
+}
+
+// Returns the configured complexity N (mConfig.mComplexityN).
+double Bench::complexityN() const noexcept {
+    return mConfig.mComplexityN;
+}
+
+// Set a baseline to compare it to. 100% means it is exactly as fast as the baseline, >100% means it is
+// faster than the baseline, <100% means it is slower than the baseline.
+Bench& Bench::relative(bool isRelativeEnabled) noexcept {
+    mConfig.mIsRelative = isRelativeEnabled;
+    return *this;
+}
+// Returns whether relative mode is enabled.
+bool Bench::relative() const noexcept {
+    return mConfig.mIsRelative;
+}
+
+// Sets the flag that controls whether performance counters are shown.
+Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
+    mConfig.mShowPerformanceCounters = showPerformanceCounters;
+    return *this;
+}
+// Returns the flag set by performanceCounters(bool).
+bool Bench::performanceCounters() const noexcept {
+    return mConfig.mShowPerformanceCounters;
+}
+
+// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
+// If u differs from currently set unit, the stored results will be cleared.
+// Use singular (byte, not bytes).
+Bench& Bench::unit(char const* u) {
+    // mUnit is a std::string, so this compares the text, not the pointer
+    if (u != mConfig.mUnit) {
+        mResults.clear();
+    }
+    mConfig.mUnit = u;
+    return *this;
+}
+
+// std::string overload; delegates to the char const* overload.
+Bench& Bench::unit(std::string const& u) {
+    return unit(u.c_str());
+}
+
+// Returns the currently set unit.
+std::string const& Bench::unit() const noexcept {
+    return mConfig.mUnit;
+}
+
+// Sets the time unit as a duration (tu) together with the name to display for it (tuName).
+// Unlike unit() and title(), this does not clear the stored results.
+Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
+    mConfig.mTimeUnit = tu;
+    mConfig.mTimeUnitName = tuName;
+    return *this;
+}
+
+// Returns the name of the time unit.
+std::string const& Bench::timeUnitName() const noexcept {
+    return mConfig.mTimeUnitName;
+}
+
+// Returns the duration of the time unit.
+std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
+    return mConfig.mTimeUnit;
+}
+
+// Sets the benchmark title.
+// If benchmarkTitle differs from currently set title, the stored results will be cleared.
+Bench& Bench::title(const char* benchmarkTitle) {
+    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
+        mResults.clear();
+    }
+    mConfig.mBenchmarkTitle = benchmarkTitle;
+    return *this;
+}
+// std::string overload with the same behavior.
+Bench& Bench::title(std::string const& benchmarkTitle) {
+    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
+        mResults.clear();
+    }
+    mConfig.mBenchmarkTitle = benchmarkTitle;
+    return *this;
+}
+
+// Returns the currently set title.
+std::string const& Bench::title() const noexcept {
+    return mConfig.mBenchmarkTitle;
+}
+
+// Sets the benchmark name. Unlike title(), this never clears the stored results.
+Bench& Bench::name(const char* benchmarkName) {
+    mConfig.mBenchmarkName = benchmarkName;
+    return *this;
+}
+
+// std::string overload with the same behavior.
+Bench& Bench::name(std::string const& benchmarkName) {
+    mConfig.mBenchmarkName = benchmarkName;
+    return *this;
+}
+
+// Returns the currently set name.
+std::string const& Bench::name() const noexcept {
+    return mConfig.mBenchmarkName;
+}
+
+// Stores variableValue under variableName in the configuration's context,
+// replacing any value already stored under that name.
+Bench& Bench::context(char const* variableName, char const* variableValue) {
+    mConfig.mContext[variableName] = variableValue;
+    return *this;
+}
+
+// std::string overload with the same behavior.
+Bench& Bench::context(std::string const& variableName, std::string const& variableValue) {
+    mConfig.mContext[variableName] = variableValue;
+    return *this;
+}
+
+// Removes all context entries.
+Bench& Bench::clearContext() {
+    mConfig.mContext.clear();
+    return *this;
+}
+
+// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
+Bench& Bench::epochs(size_t numEpochs) noexcept {
+    mConfig.mNumEpochs = numEpochs;
+    return *this;
+}
+// Returns the configured number of epochs.
+size_t Bench::epochs() const noexcept {
+    return mConfig.mNumEpochs;
+}
+
+// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
+Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
+    mConfig.mClockResolutionMultiple = multiple;
+    return *this;
+}
+// Returns the configured clock resolution multiple.
+size_t Bench::clockResolutionMultiple() const noexcept {
+    return mConfig.mClockResolutionMultiple;
+}
+
+// Sets the maximum time each epoch should take. Default is 100ms.
+Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
+    mConfig.mMaxEpochTime = t;
+    return *this;
+}
+// Returns the configured maximum epoch time.
+std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
+    return mConfig.mMaxEpochTime;
+}
+
+// Sets the minimum time each epoch should take.
+// NOTE(review): the default is defined in Config, which is not shown here - look it up there.
+Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
+    mConfig.mMinEpochTime = t;
+    return *this;
+}
+// Returns the configured minimum epoch time.
+std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
+    return mConfig.mMinEpochTime;
+}
+
+// Sets the minimum number of iterations per epoch. A value of 0 is stored as 1.
+Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
+    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
+    return *this;
+}
+// Returns the configured minimum number of iterations per epoch.
+uint64_t Bench::minEpochIterations() const noexcept {
+    return mConfig.mMinEpochIterations;
+}
+
+// Sets the number of iterations per epoch; the value is stored as given (0 is not adjusted).
+Bench& Bench::epochIterations(uint64_t numIters) noexcept {
+    mConfig.mEpochIterations = numIters;
+    return *this;
+}
+// Returns the configured number of iterations per epoch.
+uint64_t Bench::epochIterations() const noexcept {
+    return mConfig.mEpochIterations;
+}
+
+// Sets the number of warmup iterations.
+Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
+    mConfig.mWarmup = numWarmupIters;
+    return *this;
+}
+// Returns the configured number of warmup iterations.
+uint64_t Bench::warmup() const noexcept {
+    return mConfig.mWarmup;
+}
+
+// Replaces the whole configuration with a copy of benchmarkConfig.
+// Stored results are left untouched.
+Bench& Bench::config(Config const& benchmarkConfig) {
+    mConfig = benchmarkConfig;
+    return *this;
+}
+// Returns the current configuration.
+Config const& Bench::config() const noexcept {
+    return mConfig;
+}
+
+// Sets the stream that output is written to. The pointer is stored as given;
+// the stream is not owned by the Bench.
+Bench& Bench::output(std::ostream* outstream) noexcept {
+    mConfig.mOut = outstream;
+    return *this;
+}
+
+// Returns the stream pointer set by output(std::ostream*), or by the default constructor.
+ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
+    return mConfig.mOut;
+}
+
+// Returns all results collected so far.
+std::vector<Result> const& Bench::results() const noexcept {
+    return mResults;
+}
+
+// Renders this Bench through the given template into os by forwarding to the
+// free function ankerl::nanobench::render().
+Bench& Bench::render(char const* templateContent, std::ostream& os) {
+    ::ankerl::nanobench::render(templateContent, *this, os);
+    return *this;
+}
+
+// std::string overload of the above.
+Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
+    ::ankerl::nanobench::render(templateContent, *this, os);
+    return *this;
+}
+
+// Fits the stored results against a fixed set of complexity classes
+// (O(1), O(n), O(log n), O(n log n), O(n^2), O(n^3)) and returns the candidates
+// sorted with BigO::operator<, i.e. by ascending normalized root mean square
+// and then by name.
+std::vector<BigO> Bench::complexityBigO() const {
+    std::vector<BigO> bigOs;
+    // (complexityN, median elapsed) pairs of all results that have complexityN > 0
+    auto rangeMeasure = BigO::collectRangeMeasure(mResults);
+    bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
+        return 1.0;
+    });
+    bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
+        return n;
+    });
+    bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
+        return std::log2(n);
+    });
+    bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
+        return n * std::log2(n);
+    });
+    bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
+        return n * n;
+    });
+    bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
+        return n * n * n;
+    });
+    std::sort(bigOs.begin(), bigOs.end());
+    return bigOs;
+}
+
+// Seeds the generator from std::random_device.
+// Draws both state words again as long as both of them are zero.
+Rng::Rng()
+    : mX(0)
+    , mY(0) {
+    std::random_device rd;
+    std::uniform_int_distribution<uint64_t> dist;
+    do {
+        mX = dist(rd);
+        mY = dist(rd);
+    } while (mX == 0 && mY == 0);
+}
+
+// One step of the splitmix64 mixer: advances `state` in place by a fixed increment
+// and returns a scrambled value derived from the new state.
+ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
+uint64_t splitMix64(uint64_t& state) noexcept {
+    uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
+    z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
+    z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
+    return z ^ (z >> 31U);
+}
+
+// Seeded as described in romu paper (update april 2020)
+// Both state words come from consecutive splitMix64() calls on the same seed
+// (each call modifies seed), then ten outputs are generated and discarded.
+Rng::Rng(uint64_t seed) noexcept
+    : mX(splitMix64(seed))
+    , mY(splitMix64(seed)) {
+    for (size_t i = 0; i < 10; ++i) {
+        operator()();
+    }
+}
+
+// only internally used to copy the RNG.
+// Takes the two state words as they are, without any mixing.
+Rng::Rng(uint64_t x, uint64_t y) noexcept
+    : mX(x)
+    , mY(y) {}
+
+// Returns a new Rng that has exactly the same state as this one.
+Rng Rng::copy() const noexcept {
+    return Rng{mX, mY};
+}
+
+// Restores a generator from a state vector as returned by state().
+// Throws std::runtime_error unless data holds exactly two entries.
+Rng::Rng(std::vector<uint64_t> const& data)
+    : mX(0)
+    , mY(0) {
+    if (data.size() != 2) {
+        throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
+                                 detail::fmt::to_s(data.size()));
+    }
+    mX = data[0];
+    mY = data[1];
+}
+
+// Returns the generator state as the two-element vector {mX, mY}, the format
+// accepted by the Rng(std::vector<uint64_t> const&) constructor.
+std::vector<uint64_t> Rng::state() const {
+    return std::vector<uint64_t>{mX, mY};
+}
+
+// Collects one (complexityN, median elapsed) entry for every result whose
+// configured complexityN is greater than zero; all other results are skipped.
+BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
+    BigO::RangeMeasure rangeMeasure;
+    for (auto const& result : results) {
+        if (result.config().mComplexityN > 0.0) {
+            rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
+        }
+    }
+    return rangeMeasure;
+}
+
+// Fits `measure = constant * range` to the given (range, measure) pairs and stores
+// the constant together with the normalized root mean square error of the fit.
+// NOTE(review): for an empty rangeMeasure (or one whose range values are all zero)
+// the divisions below are 0/0; presumably callers only pass non-empty data - confirm.
+BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure)
+    : mName(std::move(bigOName)) {
+
+    // estimate the constant factor
+    // (sum of range*measure divided by sum of range^2)
+    double sumRangeMeasure = 0.0;
+    double sumRangeRange = 0.0;
+
+    for (const auto& rm : rangeMeasure) {
+        sumRangeMeasure += rm.first * rm.second;
+        sumRangeRange += rm.first * rm.first;
+    }
+    mConstant = sumRangeMeasure / sumRangeRange;
+
+    // calculate root mean square
+    double err = 0.0;
+    double sumMeasure = 0.0;
+    for (const auto& rm : rangeMeasure) {
+        auto diff = mConstant * rm.first - rm.second;
+        err += diff * diff;
+
+        sumMeasure += rm.second;
+    }
+
+    // normalize the root mean square error by the mean of the measured values
+    auto n = detail::d(rangeMeasure.size());
+    auto mean = sumMeasure / n;
+    mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
+}
+
+// const char* overload; delegates to the std::string constructor.
+BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
+    : BigO(std::string(bigOName), rangeMeasure) {}
+
+// Name given at construction, e.g. "O(n)".
+std::string const& BigO::name() const noexcept {
+    return mName;
+}
+
+// The fitted constant factor.
+double BigO::constant() const noexcept {
+    return mConstant;
+}
+
+// The normalized root mean square error of the fit.
+double BigO::normalizedRootMeanSquare() const noexcept {
+    return mNormalizedRootMeanSquare;
+}
+
+// Orders by normalized root mean square first, then by name.
+bool BigO::operator<(BigO const& other) const noexcept {
+    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
+}
+
+// Prints "<constant> * <name>, rms=<normalized root mean square>".
+std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
+    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
+}
+
+// Prints the given fits as a markdown table with the columns coefficient, err% and
+// complexity, one row per entry in the order given. The stream's formatting state is
+// restored on return by the StreamStateRestorer.
+std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
+    detail::fmt::StreamStateRestorer const restorer(os);
+    os << std::endl << "|   coefficient |   err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
+    for (auto const& bigO : bigOs) {
+        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
+        // err% is the normalized root mean square, scaled to percent
+        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
+        os << "| " << bigO.name();
+        os << std::endl;
+    }
+    return os;
+}
+
+} // namespace nanobench
+} // namespace ankerl
+
+#endif // ANKERL_NANOBENCH_IMPLEMENT
+#endif // ANKERL_NANOBENCH_H_INCLUDED
diff --git a/bench/profile.sh b/bench/profile.sh
new file mode 100755
index 0000000..98e0016
--- /dev/null
+++ b/bench/profile.sh
@@ -0,0 +1,104 @@
+#!/bin/sh
+# Profile UDPspeeder on target hardware.
+# Usage: ./profile.sh [results_dir]
+#
+# Expects bench_udpspeeder_static and test_udpspeeder_static in the
+# same directory as this script (or current directory).
+# Outputs results to results_dir (default: ./profile_results/).
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" 2>/dev/null && pwd)" || SCRIPT_DIR="."
+RESULTS_DIR="${1:-./profile_results}"
+mkdir -p "$RESULTS_DIR"
+
+# Find binaries: same dir as script, then cwd
+find_bin() {
+    if [ -x "$SCRIPT_DIR/$1" ]; then echo "$SCRIPT_DIR/$1"
+    elif [ -x "./$1" ]; then echo "./$1"
+    else echo ""; fi
+}
+
+# Resolve both binaries; an empty value means "not found".
+BENCH_BIN="$(find_bin bench_udpspeeder_static)"
+TEST_BIN="$(find_bin test_udpspeeder_static)"
+
+# The benchmark binary is mandatory. A missing test binary is handled further
+# down and only produces a warning.
+if [ -z "$BENCH_BIN" ]; then
+    echo "ERROR: bench_udpspeeder_static not found" >&2
+    exit 1
+fi
+
+# Banner with UTC timestamp and host name ("unknown" if hostname fails).
+echo "=== UDPspeeder Profiling ==="
+echo "Date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+echo "Host: $(hostname 2>/dev/null || echo unknown)"
+echo ""
+
+# --- System info ---
+INFO="$RESULTS_DIR/system_info.txt"
+{
+    echo "hostname: $(hostname 2>/dev/null || echo unknown)"
+    echo "date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+    echo "uname: $(uname -a)"
+    echo "arch: $(uname -m)"
+    echo ""
+
+    if [ -f /proc/cpuinfo ]; then
+        echo "--- /proc/cpuinfo (first core) ---"
+        # Print until first blank line (= first core only)
+        sed '/^$/q' /proc/cpuinfo
+        echo ""
+
+        # Core count
+        cores=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null || echo "?")
+        echo "core_count: $cores"
+        echo ""
+    fi
+
+    # CPU frequency if available
+    if [ -d /sys/devices/system/cpu/cpu0/cpufreq ]; then
+        echo "--- cpufreq ---"
+        for f in scaling_cur_freq scaling_min_freq scaling_max_freq scaling_governor; do
+            p="/sys/devices/system/cpu/cpu0/cpufreq/$f"
+            [ -f "$p" ] && echo "$f: $(cat "$p")"
+        done
+        echo ""
+    fi
+} > "$INFO" 2>&1
+echo "System info:  $INFO"
+
+# --- Tests ---
+# Run the test binary if it was found. Its output goes to a log file; on failure
+# the log is printed and the script stops before any benchmark is run.
+if [ -n "$TEST_BIN" ]; then
+    echo ""
+    echo "--- Running tests ---"
+    TEST_LOG="$RESULTS_DIR/test_output.txt"
+    if "$TEST_BIN" > "$TEST_LOG" 2>&1; then
+        echo "Tests: PASSED"
+    else
+        echo "Tests: FAILED (see $TEST_LOG)" >&2
+        cat "$TEST_LOG"
+        exit 1
+    fi
+else
+    echo "WARNING: test_udpspeeder_static not found, skipping tests" >&2
+fi
+
+# --- Benchmarks ---
+echo ""
+echo "--- Running benchmarks ---"
+
+# Human-readable output (tee to both console and file)
+# NOTE(review): the pipeline's exit status is tee's, so a failing benchmark run
+# is not caught by `set -e` here; only the --json run below would stop the script.
+BENCH_LOG="$RESULTS_DIR/bench_output.txt"
+"$BENCH_BIN" 2>&1 | tee "$BENCH_LOG"
+
+# JSON output for machine consumption
+# The second run is expected to leave bench_results.json in the current
+# directory; it is then moved into the results directory. If the file is not
+# there, this step is skipped without a message.
+BENCH_JSON="$RESULTS_DIR/bench_results.json"
+"$BENCH_BIN" --json 2>/dev/null
+if [ -f bench_results.json ]; then
+    mv bench_results.json "$BENCH_JSON"
+    echo ""
+    echo "JSON results: $BENCH_JSON"
+fi
+
+echo ""
+echo "=== Done ==="
+echo "Results in:   $RESULTS_DIR/"
+ls -la "$RESULTS_DIR/"
diff --git a/bench/test_crc32.cpp b/bench/test_crc32.cpp
new file mode 100644
index 0000000..7230ec4
--- /dev/null
+++ b/bench/test_crc32.cpp
@@ -0,0 +1,190 @@
+#include "bench_common.h"
+#include "crc32c.h"
+#include "crc32/Crc32.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/* TEST(name, expr): prints "ok" or "FAIL" followed by name, depending on whether
+ * expr is true. On failure it increments a variable called `failures`, which must
+ * be in scope where the macro is used. */
+#define TEST(name, expr) do { \
+    if (!(expr)) { printf("  FAIL: %s\n", name); failures++; } \
+    else { printf("  ok:   %s\n", name); } \
+} while(0)
+
+/* --- Old CRC32 (zlib polynomial) baseline regression anchor --- */
+
+/* Checks crc32_fast() against the standard "123456789" test vector and against
+ * the empty input. Returns the number of failed checks. */
+static int test_crc32_old_known_answer() {
+    int failures = 0;
+
+    /* Standard test vector: CRC32 of "123456789" = 0xCBF43926 */
+    const char *tv = "123456789";
+    uint32_t got = crc32_fast(tv, 9);
+    char msg[128];
+    snprintf(msg, sizeof(msg), "crc32_old(\"123456789\") = 0x%08X (expected 0xCBF43926)", got);
+    TEST(msg, got == 0xCBF43926);
+
+    /* Empty input */
+    uint32_t empty = crc32_fast("", 0);
+    snprintf(msg, sizeof(msg), "crc32_old(\"\") = 0x%08X (expected 0x00000000)", empty);
+    TEST(msg, empty == 0x00000000);
+
+    return failures;
+}
+
+/* --- CRC32C (Castagnoli) known-answer tests --- */
+
+/* Checks crc32c_sw() against the "123456789" test vector and the empty input, and
+ * checks that crc32c() returns the same value for the test vector.
+ * Returns the number of failed checks. */
+static int test_crc32c_known_answer() {
+    int failures = 0;
+    char msg[128];
+
+    /* IETF/SCTP standard test vector: CRC32C of "123456789" = 0xE3069283 */
+    const char *tv = "123456789";
+    uint32_t sw = crc32c_sw(tv, 9);
+    snprintf(msg, sizeof(msg), "crc32c_sw(\"123456789\") = 0x%08X (expected 0xE3069283)", sw);
+    TEST(msg, sw == 0xE3069283);
+
+    /* Empty input */
+    uint32_t empty = crc32c_sw("", 0);
+    snprintf(msg, sizeof(msg), "crc32c_sw(\"\") = 0x%08X (expected 0x00000000)", empty);
+    TEST(msg, empty == 0x00000000);
+
+    /* Dispatched version should agree */
+    uint32_t dispatched = crc32c(tv, 9);
+    snprintf(msg, sizeof(msg), "crc32c(\"123456789\") = 0x%08X (expected 0xE3069283)", dispatched);
+    TEST(msg, dispatched == 0xE3069283);
+
+    return failures;
+}
+
+/* --- Hardware vs software agreement --- */
+
+/* Compares crc32c_hw() with crc32c_sw() on generated data: first for every size in
+ * bench_sizes, then for a list of odd sizes. Skipped (returns 0) when
+ * crc32c_has_hw() reports no hardware support. A failed allocation is counted as a
+ * test failure. Returns the number of failed checks. */
+static int test_crc32c_hw_sw_agree() {
+    int failures = 0;
+    char msg[128];
+
+    if (!crc32c_has_hw()) {
+        printf("  skip: no CRC32C hardware support detected\n");
+        return 0;
+    }
+
+    /* Test across various sizes and data patterns */
+    for (int i = 0; i < bench_sizes_count; i++) {
+        size_t sz = bench_sizes[i];
+        char *buf = (char *)malloc(sz);
+        /* malloc(0) may legitimately return NULL, so only sz != 0 is an error */
+        if (buf == NULL && sz != 0) {
+            snprintf(msg, sizeof(msg), "crc32c hw==sw at %zu bytes: malloc failed", sz);
+            TEST(msg, false);
+            continue;
+        }
+        for (size_t j = 0; j < sz; j++)
+            buf[j] = (char)((j * 13 + 7) & 0xFF);
+
+        uint32_t sw = crc32c_sw(buf, sz);
+        uint32_t hw = crc32c_hw(buf, sz);
+
+        snprintf(msg, sizeof(msg), "crc32c hw==sw at %zu bytes (sw=0x%08X hw=0x%08X)",
+                 sz, sw, hw);
+        TEST(msg, sw == hw);
+
+        free(buf);
+    }
+
+    /* Also test odd sizes that stress alignment/tail handling */
+    int odd_sizes[] = {1, 3, 7, 15, 31, 63, 127, 255, 1023, 1499};
+    for (int s = 0; s < (int)(sizeof(odd_sizes)/sizeof(odd_sizes[0])); s++) {
+        int sz = odd_sizes[s];
+        char *buf = (char *)malloc(sz);
+        if (buf == NULL) {
+            snprintf(msg, sizeof(msg), "crc32c hw==sw at %d bytes (odd): malloc failed", sz);
+            TEST(msg, false);
+            continue;
+        }
+        for (int j = 0; j < sz; j++)
+            buf[j] = (char)((j * 41 + 3) & 0xFF);
+
+        uint32_t sw = crc32c_sw(buf, sz);
+        uint32_t hw = crc32c_hw(buf, sz);
+
+        /* report both values, like the loop above, so a mismatch can be diagnosed */
+        snprintf(msg, sizeof(msg), "crc32c hw==sw at %d bytes (odd) (sw=0x%08X hw=0x%08X)",
+                 sz, sw, hw);
+        TEST(msg, sw == hw);
+
+        free(buf);
+    }
+
+    return failures;
+}
+
+/* --- Incremental chaining --- */
+
+/* Checks that computing crc32c() over a 1024-byte buffer in one call gives the same
+ * result as computing it over the first half and passing that result as third
+ * argument when processing the second half. Returns the number of failed checks. */
+static int test_crc32c_chaining() {
+    int failures = 0;
+    char msg[128];
+    const int sz = 1024;
+    char buf[1024];
+
+    for (int i = 0; i < sz; i++)
+        buf[i] = (char)((i * 17 + 5) & 0xFF);
+
+    /* Full CRC in one shot */
+    uint32_t full = crc32c(buf, sz);
+
+    /* CRC in two halves, chained */
+    uint32_t first_half = crc32c(buf, sz / 2);
+    uint32_t chained = crc32c(buf + sz / 2, sz / 2, first_half);
+
+    snprintf(msg, sizeof(msg),
+             "crc32c chaining: full=0x%08X chained=0x%08X", full, chained);
+    TEST(msg, full == chained);
+
+    return failures;
+}
+
+/* Checks that crc32c() - and crc32c_hw(), when available - agree with crc32c_sw()
+ * when the input starts 1 or 3 bytes into an allocated buffer. A failed allocation
+ * is counted as a test failure. Returns the number of failed checks. */
+static int test_crc32c_unaligned() {
+    int failures = 0;
+    char msg[128];
+
+    const int sizes[] = {64, 256, 1500};
+    const int offsets[] = {1, 3};
+    /* loop bounds follow the tables, so adding an entry cannot be missed */
+    const int n_sizes = (int)(sizeof(sizes) / sizeof(sizes[0]));
+    const int n_offsets = (int)(sizeof(offsets) / sizeof(offsets[0]));
+
+    for (int si = 0; si < n_sizes; si++) {
+        int sz = sizes[si];
+        /* sz + 8 bytes are allocated and filled; the ranges checked below
+         * (start raw + off, length sz - off) stay within the first sz bytes */
+        char *raw = (char *)malloc(sz + 8);
+        if (raw == NULL) {
+            snprintf(msg, sizeof(msg), "crc32c unaligned: malloc(%d) failed", sz + 8);
+            TEST(msg, false);
+            continue;
+        }
+        for (int i = 0; i < sz + 8; i++)
+            raw[i] = (char)((i * 41 + 3) & 0xFF);
+
+        for (int oi = 0; oi < n_offsets; oi++) {
+            int off = offsets[oi];
+            int len = sz - off;
+
+            uint32_t sw = crc32c_sw(raw + off, len);
+            uint32_t dispatched = crc32c(raw + off, len);
+
+            snprintf(msg, sizeof(msg),
+                     "crc32c unaligned off=%d len=%d: dispatched==sw (0x%08X)",
+                     off, len, sw);
+            TEST(msg, dispatched == sw);
+
+            if (crc32c_has_hw()) {
+                uint32_t hw = crc32c_hw(raw + off, len);
+                snprintf(msg, sizeof(msg),
+                         "crc32c unaligned off=%d len=%d: hw==sw (0x%08X)",
+                         off, len, sw);
+                TEST(msg, hw == sw);
+            }
+        }
+        free(raw);
+    }
+
+    return failures;
+}
+
+/* Runs every CRC test group in a fixed order, printing a banner before each one.
+ * Returns the total number of failed checks. */
+int run_crc32_tests() {
+    struct Group {
+        const char *banner;
+        int (*run)();
+    };
+    static const Group groups[] = {
+        {"[CRC32 old known-answer]\n", test_crc32_old_known_answer},
+        {"[CRC32C known-answer]\n", test_crc32c_known_answer},
+        {"[CRC32C hw vs sw agreement]\n", test_crc32c_hw_sw_agree},
+        {"[CRC32C incremental chaining]\n", test_crc32c_chaining},
+        {"[CRC32C unaligned input]\n", test_crc32c_unaligned},
+    };
+
+    int total = 0;
+    for (size_t g = 0; g < sizeof(groups) / sizeof(groups[0]); g++) {
+        fputs(groups[g].banner, stdout);
+        total += groups[g].run();
+    }
+    return total;
+}
diff --git a/bench/test_fec.cpp b/bench/test_fec.cpp
new file mode 100644
index 0000000..6687d13
--- /dev/null
+++ b/bench/test_fec.cpp
@@ -0,0 +1,263 @@
+#include "bench_common.h"
+#include "lib/rs.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/*
+ * TEST(name, expr): prints an "ok" or "FAIL" line carrying `name`, depending
+ * on whether `expr` is true. On failure it also increments a variable called
+ * `failures`, which must be in scope wherever the macro is used.
+ */
+#define TEST(name, expr) do { \
+    if (!(expr)) { printf("  FAIL: %s\n", name); failures++; } \
+    else { printf("  ok:   %s\n", name); } \
+} while(0)
+
+/* Writes the byte sequence seed, seed+1, seed+2, ... (mod 256) into buf[0..sz). */
+static void fill_pattern(char *buf, int sz, int seed) {
+    int pos = 0;
+    while (pos < sz) {
+        buf[pos] = (char)((seed + pos) & 0xFF);
+        ++pos;
+    }
+}
+
+/*
+ * addmul1 sanity checks with the two trivial coefficients:
+ * c = 1 applied to a zeroed destination must reproduce src, and
+ * c = 0 must leave the destination untouched.
+ */
+static int test_addmul1_identity() {
+    int failures = 0;
+    const int sz = 256;
+    gf dst[256], src[256], expected[256];
+
+    for (int i = 0; i < sz; i++)
+        src[i] = (gf)(i & 0xFF);
+
+    /* c = 1: dst ^= src * 1, and dst starts as zero, so dst must equal src */
+    memset(dst, 0, sz);
+    bench_addmul1(dst, src, 1, sz);
+    const bool copied = (memcmp(dst, src, sz) == 0);
+    TEST("addmul1(dst=0, src, c=1) == src", copied);
+
+    /* c = 0: snapshot dst first, then confirm the call did not modify it */
+    for (int i = 0; i < sz; i++)
+        dst[i] = (gf)i;
+    memcpy(expected, dst, sz);
+    bench_addmul1(dst, src, 0, sz);
+    const bool untouched = (memcmp(dst, expected, sz) == 0);
+    TEST("addmul1(dst, src, c=0) leaves dst unchanged", untouched);
+
+    return failures;
+}
+
+/*
+ * Checks that addmul1 is linear in its coefficient: accumulating src*a and
+ * then src*b into a zeroed buffer must give the same bytes as accumulating
+ * src*(a^b) once. Returns the number of failed checks.
+ */
+static int test_addmul1_linearity() {
+    int failures = 0;
+    const int sz = 256;
+    /* NOTE(review): dst_b is declared but never used below. */
+    gf src[256], dst_a[256], dst_b[256], dst_ab[256];
+
+    for (int i = 0; i < sz; i++) src[i] = (gf)((i * 37 + 11) & 0xFF);
+
+    /* In GF(2^8) addition is XOR and multiplication distributes over it, so
+     *   (src*a) XOR (src*b) == src*(a XOR b).
+     * Path 1 accumulates the two products separately into dst_a;
+     * path 2 accumulates the single combined product into dst_ab.
+     * Both buffers must end up identical. */
+    gf a = 0x53, b = 0xCA;
+
+    memset(dst_a, 0, sz);
+    bench_addmul1(dst_a, src, a, sz);
+    bench_addmul1(dst_a, src, b, sz);
+
+    memset(dst_ab, 0, sz);
+    /* Single call with the XOR-combined coefficient */
+    bench_addmul1(dst_ab, src, a ^ b, sz);
+
+    TEST("addmul1 linearity: (src*a)^(src*b) == src*(a^b)",
+         memcmp(dst_a, dst_ab, sz) == 0);
+
+    return failures;
+}
+
+/*
+ * Runs addmul1 with c = 1 at every length listed in bench_sizes[] and checks
+ * that a zeroed destination ends up equal to the source. Intended to expose
+ * off-by-one errors in unrolled implementations.
+ */
+static int test_addmul1_sizes() {
+    int failures = 0;
+    char name[64];
+
+    for (int idx = 0; idx < bench_sizes_count; ++idx) {
+        const int sz = (int)bench_sizes[idx];
+        gf *src = (gf *)calloc(sz, 1);
+        gf *dst = (gf *)calloc(sz, 1);
+
+        for (int j = 0; j < sz; ++j)
+            src[j] = (gf)((j * 7) & 0xFF);
+
+        bench_addmul1(dst, src, 1, sz);
+        const bool same = (memcmp(dst, src, sz) == 0);
+
+        snprintf(name, sizeof(name), "addmul1 c=1 at %d bytes", sz);
+        TEST(name, same);
+
+        free(src);
+        free(dst);
+    }
+    return failures;
+}
+
+/*
+ * Reed-Solomon round trip: fills k data shards, encodes to n shards, drops
+ * the first (n - k) entries of the shard array, decodes, and checks that
+ * data[0..k) equals the original payloads again.
+ *
+ * k, n    - number of data shards / total shards
+ * pkt_sz  - size of every shard in bytes
+ * Returns the number of failed checks.
+ */
+static int test_rs_roundtrip(int k, int n, int pkt_sz) {
+    int failures = 0;
+    int redundant = n - k;
+    char label[64];
+
+    /* data[] is what the codec sees; entries get NULLed below and the decoder
+     * may move pointers around inside it. owned[] keeps one reference to each
+     * allocation so every buffer can be freed exactly once at the end.
+     * Assumes rs_decode2 neither frees nor replaces the caller's buffers. */
+    char **data = (char **)calloc(n, sizeof(char *));
+    char **owned = (char **)calloc(n, sizeof(char *));
+    char **orig = (char **)calloc(k, sizeof(char *));
+    for (int i = 0; i < n; i++)
+        owned[i] = data[i] = (char *)calloc(1, pkt_sz);
+    for (int i = 0; i < k; i++) {
+        fill_pattern(data[i], pkt_sz, i * 31);
+        orig[i] = (char *)calloc(1, pkt_sz);
+        memcpy(orig[i], data[i], pkt_sz);
+    }
+
+    /* Encode */
+    rs_encode2(k, n, data, pkt_sz);
+
+    /* Simulate losing the first 'redundant' shards */
+    for (int i = 0; i < redundant; i++)
+        data[i] = NULL;
+
+    /* Decode */
+    int rc = rs_decode2(k, n, data, pkt_sz);
+    if (rc != 0) {
+        snprintf(label, sizeof(label), "rs_decode2 returned %d for k=%d n=%d", rc, k, n);
+        TEST(label, 0);
+    } else {
+        /* Verify recovered data matches originals */
+        for (int i = 0; i < k; i++) {
+            snprintf(label, sizeof(label), "rs data[%d] matches (k=%d n=%d)", i, k, n);
+            TEST(label, data[i] != NULL && memcmp(data[i], orig[i], pkt_sz) == 0);
+        }
+    }
+
+    /* Release everything through the ownership lists, never through data[],
+     * whose entries may be NULL or aliased after decoding. */
+    for (int i = 0; i < n; i++) free(owned[i]);
+    free(owned);
+    for (int i = 0; i < k; i++) free(orig[i]);
+    free(orig);
+    free(data);
+
+    return failures;
+}
+
+/*
+ * RS round-trip losing specific shard indices (instead of always the first r).
+ * lose_idx[0..lose_count-1] lists which of the n shards to NULL before decode;
+ * pattern_name only labels the output lines.
+ *
+ * The test fails immediately (without encoding) when more shards are to be
+ * dropped than there are redundant shards, or when an index is outside
+ * [0, n). Returns the number of failed checks.
+ */
+static int test_rs_roundtrip_pattern(int k, int n, int pkt_sz,
+                                     const int *lose_idx, int lose_count,
+                                     const char *pattern_name) {
+    int failures = 0;
+    int redundant = n - k;
+    char label[128];
+
+    if (lose_count > redundant) {
+        snprintf(label, sizeof(label), "rs k=%d n=%d %s: lose_count(%d) > redundant(%d)",
+                 k, n, pattern_name, lose_count, redundant);
+        TEST(label, 0);
+        return failures;
+    }
+    /* Reject indices that would write outside data[] */
+    for (int i = 0; i < lose_count; i++) {
+        if (lose_idx[i] < 0 || lose_idx[i] >= n) {
+            snprintf(label, sizeof(label), "rs k=%d n=%d %s: lose_idx[%d]=%d out of range",
+                     k, n, pattern_name, i, lose_idx[i]);
+            TEST(label, 0);
+            return failures;
+        }
+    }
+
+    /* owned[] keeps one reference per allocation so that each buffer is freed
+     * exactly once, regardless of how data[] is NULLed or rearranged.
+     * Assumes rs_decode2 neither frees nor replaces the caller's buffers. */
+    char **data = (char **)calloc(n, sizeof(char *));
+    char **owned = (char **)calloc(n, sizeof(char *));
+    char **orig = (char **)calloc(k, sizeof(char *));
+    for (int i = 0; i < n; i++)
+        owned[i] = data[i] = (char *)calloc(1, pkt_sz);
+    for (int i = 0; i < k; i++) {
+        fill_pattern(data[i], pkt_sz, i * 31);
+        orig[i] = (char *)calloc(1, pkt_sz);
+        memcpy(orig[i], data[i], pkt_sz);
+    }
+
+    rs_encode2(k, n, data, pkt_sz);
+
+    for (int i = 0; i < lose_count; i++)
+        data[lose_idx[i]] = NULL;
+
+    int rc = rs_decode2(k, n, data, pkt_sz);
+    snprintf(label, sizeof(label), "rs k=%d n=%d %s: decode ok", k, n, pattern_name);
+    TEST(label, rc == 0);
+
+    if (rc == 0) {
+        for (int i = 0; i < k; i++) {
+            snprintf(label, sizeof(label), "rs k=%d n=%d %s: data[%d] matches",
+                     k, n, pattern_name, i);
+            TEST(label, data[i] != NULL && memcmp(data[i], orig[i], pkt_sz) == 0);
+        }
+    }
+
+    for (int i = 0; i < n; i++) free(owned[i]);
+    free(owned);
+    for (int i = 0; i < k; i++) free(orig[i]);
+    free(orig);
+    free(data);
+    return failures;
+}
+
+/*
+ * Entry point for the FEC test group: runs the addmul1 checks followed by
+ * Reed-Solomon round trips over several (k, n) combinations and loss
+ * patterns. Returns the total number of failed checks.
+ */
+int run_fec_tests() {
+    int failures = 0;
+
+    /* GF tables are initialized inside fec_new; force init via a dummy allocation */
+    void *dummy = fec_new(2, 3);
+    fec_free(dummy);
+
+    printf("[addmul1 identity]\n");
+    failures += test_addmul1_identity();
+
+    printf("[addmul1 linearity]\n");
+    failures += test_addmul1_linearity();
+
+    printf("[addmul1 sizes]\n");
+    failures += test_addmul1_sizes();
+
+    /* Arguments are (k, n, shard size in bytes) */
+    printf("[rs round-trip: lose first r]\n");
+    failures += test_rs_roundtrip(2, 4, 1500);
+    failures += test_rs_roundtrip(5, 8, 1500);
+    failures += test_rs_roundtrip(10, 15, 1024);
+
+    /* Additional k/n combos, including the k=1 edge case */
+    printf("[rs round-trip: more k/n combos]\n");
+    failures += test_rs_roundtrip(1, 2, 1500);
+    failures += test_rs_roundtrip(1, 3, 1500);
+    failures += test_rs_roundtrip(20, 30, 1024);
+    failures += test_rs_roundtrip(50, 75, 512);
+
+    /* Diverse loss patterns: each lose[] array lists the shard indices dropped */
+    printf("[rs round-trip: lose last r]\n");
+    {
+        /* k=5 n=8: lose shards 5,6,7 (last 3) */
+        int lose[] = {5, 6, 7};
+        failures += test_rs_roundtrip_pattern(5, 8, 1500, lose, 3, "lose-last");
+    }
+    {
+        /* k=10 n=15: lose shards 10,11,12,13,14 */
+        int lose[] = {10, 11, 12, 13, 14};
+        failures += test_rs_roundtrip_pattern(10, 15, 1024, lose, 5, "lose-last");
+    }
+
+    printf("[rs round-trip: lose every-other]\n");
+    {
+        /* k=5 n=8: lose shards 0,2,4 (every other, 3 lost = r) */
+        int lose[] = {0, 2, 4};
+        failures += test_rs_roundtrip_pattern(5, 8, 1500, lose, 3, "lose-evens");
+    }
+    {
+        /* k=10 n=15: lose shards 1,3,5,7,9 (odd indices, 5 lost = r) */
+        int lose[] = {1, 3, 5, 7, 9};
+        failures += test_rs_roundtrip_pattern(10, 15, 1024, lose, 5, "lose-odds");
+    }
+
+    printf("[rs round-trip: lose middle]\n");
+    {
+        /* k=5 n=8: lose shards 2,3,4 (middle) */
+        int lose[] = {2, 3, 4};
+        failures += test_rs_roundtrip_pattern(5, 8, 1500, lose, 3, "lose-middle");
+    }
+    {
+        /* k=20 n=30: lose shards 5,6,10,11,15,16,20,21,25,26
+         * (scattered adjacent pairs, 10 lost = r) */
+        int lose[] = {5, 6, 10, 11, 15, 16, 20, 21, 25, 26};
+        failures += test_rs_roundtrip_pattern(20, 30, 512, lose, 10, "lose-scattered");
+    }
+
+    return failures;
+}
diff --git a/bench/test_main.cpp b/bench/test_main.cpp
new file mode 100644
index 0000000..90b3421
--- /dev/null
+++ b/bench/test_main.cpp
@@ -0,0 +1,23 @@
+#include "bench_common.h"
+#include <cstdio>
+
+/*
+ * Test driver: runs the FEC, CRC32 and packet-cook groups in that order,
+ * prints a summary line, and exits with status 1 if any check failed.
+ */
+int main() {
+    printf("=== FEC Tests ===\n");
+    int failures = run_fec_tests();
+
+    printf("\n=== CRC32 Tests ===\n");
+    failures += run_crc32_tests();
+
+    printf("\n=== Packet Cook Tests ===\n");
+    failures += run_packet_tests();
+
+    printf("\n");
+    if (failures != 0)
+        printf("%d test(s) FAILED.\n", failures);
+    else
+        printf("All tests passed.\n");
+
+    if (failures > 0)
+        return 1;
+    return 0;
+}
diff --git a/bench/test_packet.cpp b/bench/test_packet.cpp
new file mode 100644
index 0000000..9259fba
--- /dev/null
+++ b/bench/test_packet.cpp
@@ -0,0 +1,275 @@
+#include "bench_common.h"
+#include "packet_cook.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/* Stubs for packet_cook.cpp dependencies — production uses common.cpp */
+#ifndef BENCH_PACKET_STUBS_DEFINED
+#define BENCH_PACKET_STUBS_DEFINED
+/* Test stub: fills s[0..len) with the low byte of successive rand() values. */
+void get_fake_random_chars(char *s, int len) {
+    int filled = 0;
+    while (filled < len) {
+        s[filled] = (char)(rand() & 0xFF);
+        ++filled;
+    }
+}
+
+/*
+ * Test stub: returns a rand()-based value in the inclusive range [a, b].
+ * When a == b that value is returned directly.
+ *
+ * The range width b + 1 - a is computed in unsigned arithmetic and wraps to 0
+ * when the interval covers every unsigned value (or when a == b + 1); in that
+ * case no reduction is applied, which avoids a modulo by zero.
+ */
+int random_between(unsigned int a, unsigned int b) {
+    if (a == b) return (int)a;
+    const unsigned int span = b + 1 - a;
+    const unsigned int r = (unsigned int)rand();
+    if (span == 0) return (int)(a + r);
+    return (int)(a + r % span);
+}
+#endif
+
+/*
+ * TEST(name, expr): prints an "ok" or "FAIL" line carrying `name`, depending
+ * on whether `expr` is true. On failure it also increments a variable called
+ * `failures`, which must be in scope wherever the macro is used.
+ */
+#define TEST(name, expr) do { \
+    if (!(expr)) { printf("  FAIL: %s\n", name); failures++; } \
+    else { printf("  ok:   %s\n", name); } \
+} while(0)
+
+/*
+ * Cooks and de-cooks patterned payloads of several sizes with a keyed
+ * context, checking that cooking grows the packet, that de-cooking restores
+ * the original length, and that the payload bytes survive unchanged.
+ */
+static int test_cook_roundtrip() {
+    int failures = 0;
+    char label[80];
+
+    cook_ctx_t ctx = {};
+    strcpy(ctx.key, "testkey123");
+    cook_ctx_prepare_key(&ctx);
+    ctx.iv_min = 4;
+    ctx.iv_max = 32;
+
+    const int sizes[] = { 1, 16, 64, 256, 1024, 1500 };
+    const int *const sizes_end = sizes + sizeof(sizes) / sizeof(sizes[0]);
+
+    for (const int *it = sizes; it != sizes_end; ++it) {
+        const int sz = *it;
+        char orig[4096], buf[4096];
+
+        /* Reference payload and the working copy that gets cooked in place */
+        for (int i = 0; i < sz; i++)
+            orig[i] = (char)((i * 37 + 11) & 0xFF);
+        memcpy(buf, orig, sz);
+
+        int len = sz;
+        int rc = do_cook(&ctx, buf, len);
+        snprintf(label, sizeof(label), "do_cook succeeds at %d bytes", sz);
+        TEST(label, rc == 0 && len > sz);
+
+        rc = de_cook(&ctx, buf, len);
+        snprintf(label, sizeof(label), "de_cook succeeds at %d bytes", sz);
+        TEST(label, rc == 0 && len == sz);
+
+        const bool intact = (memcmp(buf, orig, sz) == 0);
+        snprintf(label, sizeof(label), "round-trip data matches at %d bytes", sz);
+        TEST(label, intact);
+    }
+
+    return failures;
+}
+
+/*
+ * Exercises the cook pipeline with obscuring and XOR disabled, so that only
+ * the checksum stage is active: the cooked packet must be 4 bytes longer,
+ * must round-trip, and a flipped bit must be rejected by de_cook.
+ * Returns the number of failed checks.
+ */
+static int test_cook_checksum_only() {
+    int failures = 0;
+    cook_ctx_t ctx = {};
+    ctx.iv_min = 4;
+    ctx.iv_max = 32;
+    ctx.disable_obscure = 1;
+    ctx.disable_xor = 1;
+
+    char orig[1600], buf[1600];
+    int sz = 100;
+    for (int i = 0; i < sz; i++) orig[i] = (char)i;
+    memcpy(buf, orig, sz);
+
+    /* do_cook's return value is not checked here; the length check below is
+     * the only verification of the encode step. */
+    int len = sz;
+    do_cook(&ctx, buf, len);
+    TEST("checksum adds 4 bytes", len == sz + 4);
+
+    int rc = de_cook(&ctx, buf, len);
+    TEST("checksum round-trip succeeds", rc == 0 && len == sz);
+    TEST("checksum data matches", memcmp(buf, orig, sz) == 0);
+
+    /* Corrupt a byte and verify detection: re-cook a fresh copy, flip the
+     * lowest bit of the first byte, and expect de_cook to report an error. */
+    memcpy(buf, orig, sz);
+    len = sz;
+    do_cook(&ctx, buf, len);
+    buf[0] ^= 0x01;
+    rc = de_cook(&ctx, buf, len);
+    TEST("checksum detects corruption", rc != 0);
+
+    return failures;
+}
+
+/*
+ * With checksum, obscure and XOR all disabled, do_cook must leave both the
+ * packet length and the packet bytes exactly as they were.
+ */
+static int test_cook_disabled() {
+    int failures = 0;
+    const int sz = 100;
+
+    cook_ctx_t ctx = {};
+    ctx.iv_min = 4;
+    ctx.iv_max = 32;
+    ctx.disable_checksum = 1;
+    ctx.disable_obscure = 1;
+    ctx.disable_xor = 1;
+
+    /* Build the reference payload, then cook a copy of it in place */
+    char orig[256];
+    char buf[256];
+    for (int i = 0; i < sz; i++)
+        orig[i] = (char)i;
+    memcpy(buf, orig, sz);
+
+    int len = sz;
+    do_cook(&ctx, buf, len);
+
+    const bool same_len = (len == sz);
+    const bool same_data = (memcmp(buf, orig, sz) == 0);
+    TEST("all disabled: length unchanged", same_len);
+    TEST("all disabled: data unchanged", same_data);
+
+    return failures;
+}
+
+/*
+ * Checks that bench_xor_tile is its own inverse: applying the same tile twice
+ * to a buffer must restore the original bytes, and a single application must
+ * change them. Covered: tile lengths of 1x, 2x and 5x the width reported by
+ * bench_cook_vec_width(), several data lengths, and several starting offsets
+ * into the backing buffer.
+ *
+ * The tile is heap-allocated to the exact length under test so that a wide
+ * vector width cannot overrun a fixed-size array; a non-positive length or a
+ * failed allocation is reported as a test failure.
+ * Returns the number of failed checks.
+ */
+static int test_xor_tile_roundtrip() {
+    int failures = 0;
+    int vec_w = bench_cook_vec_width();
+    char label[128];
+
+    const int tile_lens[] = {vec_w, vec_w * 2, vec_w * 5};
+    const int num_tiles = (int)(sizeof(tile_lens) / sizeof(tile_lens[0]));
+    const int data_lens[] = {1, 7, 8, 15, 16, 31, 32, 63, 64, 1500};
+    const int num_datas = (int)(sizeof(data_lens) / sizeof(data_lens[0]));
+    const int offsets[] = {0, 1, 3, 7};
+    const int num_offsets = (int)(sizeof(offsets) / sizeof(offsets[0]));
+
+    for (int tl = 0; tl < num_tiles; tl++) {
+        int tile_len = tile_lens[tl];
+        char *tile = tile_len > 0 ? (char *)malloc(tile_len) : NULL;
+        if (tile == NULL) {
+            snprintf(label, sizeof(label),
+                "xor_tile tile=%d: tile buffer available", tile_len);
+            TEST(label, 0);
+            continue;
+        }
+        for (int i = 0; i < tile_len; i++)
+            tile[i] = (char)((i * 37 + 11) & 0xFF);
+
+        for (int dl = 0; dl < num_datas; dl++) {
+            int data_len = data_lens[dl];
+            for (int ol = 0; ol < num_offsets; ol++) {
+                int offset = offsets[ol];
+                /* Largest use is offset 7 + 1500 bytes, well inside 2048 */
+                char backing[2048];
+                char orig[2048];
+                char *data = backing + offset;
+                for (int i = 0; i < data_len; i++)
+                    data[i] = (char)((i * 13 + 7) & 0xFF);
+                memcpy(orig, data, data_len);
+
+                /* XOR once should change data (tile starts with a non-zero byte) */
+                bench_xor_tile(data, data_len, tile, tile_len);
+                int changed = (memcmp(data, orig, data_len) != 0);
+
+                /* XOR again should restore original */
+                bench_xor_tile(data, data_len, tile, tile_len);
+
+                snprintf(label, sizeof(label),
+                    "xor_tile tile=%d data=%d off=%d", tile_len, data_len, offset);
+                TEST(label, changed && memcmp(data, orig, data_len) == 0);
+            }
+        }
+        free(tile);
+    }
+    return failures;
+}
+
+/*
+ * Round-trips one sz-byte payload through do_cook/de_cook with the given
+ * combination of disable flags (non-zero = stage disabled) and checks the
+ * encode status, the decode status and length, and the recovered bytes.
+ */
+static int test_cook_combo(int disable_checksum, int disable_obscure, int disable_xor,
+                           int sz) {
+    int failures = 0;
+
+    /* Common prefix shared by the three result lines */
+    char tag[96];
+    snprintf(tag, sizeof(tag), "cook cs=%s ob=%s xr=%s sz=%d",
+             disable_checksum ? "off" : "on",
+             disable_obscure ? "off" : "on",
+             disable_xor ? "off" : "on",
+             sz);
+    char label[128];
+
+    cook_ctx_t ctx = {};
+    strcpy(ctx.key, "testkey123");
+    cook_ctx_prepare_key(&ctx);
+    ctx.iv_min = 4;
+    ctx.iv_max = 32;
+    ctx.disable_checksum = disable_checksum;
+    ctx.disable_obscure = disable_obscure;
+    ctx.disable_xor = disable_xor;
+
+    char orig[4096], buf[4096];
+    for (int i = 0; i < sz; i++)
+        orig[i] = (char)((i * 37 + 11) & 0xFF);
+    memcpy(buf, orig, sz);
+
+    int len = sz;
+    const int enc_rc = do_cook(&ctx, buf, len);
+    snprintf(label, sizeof(label), "%s: encode ok", tag);
+    TEST(label, enc_rc == 0);
+
+    const int dec_rc = de_cook(&ctx, buf, len);
+    snprintf(label, sizeof(label), "%s: decode ok", tag);
+    TEST(label, dec_rc == 0 && len == sz);
+
+    snprintf(label, sizeof(label), "%s: data matches", tag);
+    TEST(label, memcmp(buf, orig, sz) == 0);
+
+    return failures;
+}
+
+/*
+ * Runs test_cook_combo for all eight checksum/obscure/xor flag combinations
+ * at each payload size. The three flags are taken from bits 2, 1 and 0 of a
+ * counter, so the xor flag varies fastest and the checksum flag slowest.
+ */
+static int test_cook_all_combos() {
+    int failures = 0;
+    const int sizes[] = {64, 1500};
+    const int num_sizes = (int)(sizeof(sizes) / sizeof(sizes[0]));
+
+    for (int s = 0; s < num_sizes; s++) {
+        for (int mask = 0; mask < 8; mask++) {
+            const int cs = (mask >> 2) & 1;
+            const int ob = (mask >> 1) & 1;
+            const int xr = mask & 1;
+            failures += test_cook_combo(cs, ob, xr, sizes[s]);
+        }
+    }
+    return failures;
+}
+
+/*
+ * Round-trips payloads through do_cook/de_cook with the packet buffer
+ * starting at several byte offsets into its backing array, to check that
+ * cooking does not depend on where the buffer starts.
+ * Returns the number of failed checks.
+ */
+static int test_cook_unaligned() {
+    int failures = 0;
+    char label[128];
+    int offsets[] = {0, 1, 3, 5, 7};
+    int num_offsets = 5;
+    int sizes[] = {64, 256, 1500};
+    int nsizes = 3;
+
+    for (int ol = 0; ol < num_offsets; ol++) {
+        int offset = offsets[ol];
+        for (int s = 0; s < nsizes; s++) {
+            int sz = sizes[s];
+            /* Fixed 4096-byte backing store: holds the largest payload (1500)
+             * plus the offset, with the remainder left as headroom for the
+             * bytes do_cook appends. */
+            char backing[4096];
+            char orig[4096];
+            char *buf = backing + offset;
+
+            /* A fresh keyed context is built for every offset/size pair */
+            cook_ctx_t ctx = {};
+            strcpy(ctx.key, "testkey123");
+            cook_ctx_prepare_key(&ctx);
+            ctx.iv_min = 4;
+            ctx.iv_max = 32;
+
+            for (int i = 0; i < sz; i++)
+                buf[i] = (char)((i * 37 + 11) & 0xFF);
+            memcpy(orig, buf, sz);
+
+            /* do_cook's return value is not checked; a failed encode shows up
+             * through the de_cook status / length / data comparison below. */
+            int len = sz;
+            do_cook(&ctx, buf, len);
+            int rc = de_cook(&ctx, buf, len);
+
+            snprintf(label, sizeof(label),
+                "cook unaligned off=%d sz=%d: round-trip", offset, sz);
+            TEST(label, rc == 0 && len == sz && memcmp(buf, orig, sz) == 0);
+        }
+    }
+    return failures;
+}
+
+/*
+ * Runs every packet-cook test group in order, printing each group's banner
+ * before it executes. Returns the total number of failed checks.
+ */
+int run_packet_tests() {
+    struct suite_t {
+        const char *banner;
+        int (*run)();
+    };
+    static const suite_t suites[] = {
+        {"[cook round-trip]\n", test_cook_roundtrip},
+        {"[cook checksum only]\n", test_cook_checksum_only},
+        {"[cook all disabled]\n", test_cook_disabled},
+        {"[xor_tile round-trip]\n", test_xor_tile_roundtrip},
+        {"[cook all 8 enable/disable combos]\n", test_cook_all_combos},
+        {"[cook unaligned buffers]\n", test_cook_unaligned},
+    };
+    const int num_suites = (int)(sizeof(suites) / sizeof(suites[0]));
+
+    int failures = 0;
+    for (int i = 0; i < num_suites; i++) {
+        printf("%s", suites[i].banner);
+        failures += suites[i].run();
+    }
+    return failures;
+}
diff --git a/bench/throughput.sh b/bench/throughput.sh
new file mode 100755
index 0000000..8bf7e96
--- /dev/null
+++ b/bench/throughput.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+# bench/throughput.sh — Measure end-to-end UDP tunnel throughput
+#
+# Usage: ./bench/throughput.sh <speederv2_binary> [options]
+#   --duration N     seconds of sending per iteration (default: 10)
+#   --fec X:Y        FEC parameter, passed to the binary as "-f X:Y"
+#                    (default: FEC disabled)
+#   --disable-fec    explicitly disable FEC
+#   --iterations N   number of measured runs, reports median (default: 5)
+#   --json           print the result as a single JSON object
+#                    ({"name", "unit", "value"}) instead of plain text
+#
+# One extra warmup run is always performed before the measured iterations.
+
+set -euo pipefail
+
+# Defaults; each can be overridden by the matching command-line option.
+BINARY=""
+DURATION=10
+FEC_ARGS="--disable-fec"
+FEC_LABEL="no-fec"
+ITERATIONS=5
+JSON=0
+
+# Option parsing: the first non-option argument is taken as the binary path
+# (a later one would overwrite it); unknown options abort the script.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --duration) DURATION="$2"; shift 2 ;;
+        --fec) FEC_ARGS="-f $2"; FEC_LABEL="fec-${2//:/-}"; shift 2 ;;
+        --disable-fec) FEC_ARGS="--disable-fec"; FEC_LABEL="no-fec"; shift ;;
+        --iterations) ITERATIONS="$2"; shift 2 ;;
+        --json) JSON=1; shift ;;
+        -*) echo "Unknown option: $1" >&2; exit 1 ;;
+        *) BINARY="$1"; shift ;;
+    esac
+done
+
+if [[ -z "$BINARY" ]]; then
+    echo "Usage: $0 <speederv2_binary> [--duration N] [--fec X:Y] [--json]" >&2
+    exit 1
+fi
+
+if [[ ! -x "$BINARY" ]]; then
+    echo "Error: $BINARY is not executable" >&2
+    exit 1
+fi
+
+# Loopback ports: the tunnel server listens on PORT_TUNNEL and forwards to
+# the receiver on PORT_APP; the sender targets the tunnel client on PORT_CLIENT.
+PORT_TUNNEL=20000
+PORT_APP=20001
+PORT_CLIENT=20002
+
+# Kill and reap every background job still owned by the current shell (the
+# receiver and tunnel processes started by run_once). Errors are ignored so
+# this is safe to call when nothing is running. Also installed as the EXIT
+# trap so an aborted run does not leave processes behind.
+kill_tunnel() {
+    local pids
+    pids=$(jobs -p 2>/dev/null) || true
+    if [[ -n "$pids" ]]; then
+        # $pids is left unquoted on purpose: one PID per word
+        kill $pids 2>/dev/null || true
+        wait $pids 2>/dev/null || true
+    fi
+}
+trap kill_tunnel EXIT
+
+run_once() {
+    local tmpfile
+    tmpfile=$(mktemp)
+    kill_tunnel
+
+    # UDP receiver: writes "bytes elapsed" to tmpfile, exits after 2s of no data
+    python3 -c "
+import socket, time, sys
+sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+sock.bind(('127.0.0.1', $PORT_APP))
+sock.settimeout(2)
+total = 0
+start = None
+try:
+    while True:
+        data = sock.recv(65535)
+        if start is None:
+            start = time.monotonic()
+        total += len(data)
+except socket.timeout:
+    pass
+elapsed = time.monotonic() - start if start else 0
+result = f'{total} {elapsed:.6f}'
+sys.stdout.write(result + '\n')
+sys.stdout.flush()
+" > "$tmpfile" 2>/dev/null &
+    local recv_pid=$!
+
+    # Start tunnel
+    $BINARY -s -l 127.0.0.1:$PORT_TUNNEL -r 127.0.0.1:$PORT_APP $FEC_ARGS --log-level 0 >/dev/null 2>&1 &
+    local server_pid=$!
+
+    $BINARY -c -l 127.0.0.1:$PORT_CLIENT -r 127.0.0.1:$PORT_TUNNEL $FEC_ARGS --log-level 0 >/dev/null 2>&1 &
+    local client_pid=$!
+
+    sleep 1
+
+    # UDP sender: blasts 1400-byte packets for DURATION seconds
+    python3 -c "
+import socket, time
+sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+payload = b'\x00' * 1400
+end = time.monotonic() + $DURATION
+while time.monotonic() < end:
+    try:
+        sock.sendto(payload, ('127.0.0.1', $PORT_CLIENT))
+    except OSError:
+        pass
+"
+    echo "  sender done, waiting for receiver..." >&2
+
+    # Wait for receiver to exit naturally (2s socket timeout after last packet)
+    wait $recv_pid 2>/dev/null || true
+
+    # Kill tunnel processes
+    kill $server_pid $client_pid 2>/dev/null || true
+    wait $server_pid $client_pid 2>/dev/null || true
+
+    # Parse result
+    local result bytes elapsed
+    result=$(cat "$tmpfile")
+    rm -f "$tmpfile"
+
+    bytes=$(echo "$result" | awk '{print $1}')
+    elapsed=$(echo "$result" | awk '{print $2}')
+
+    echo "  received $bytes bytes in ${elapsed}s" >&2
+
+    if [[ -z "$bytes" || "$bytes" == "0" ]]; then
+        echo "0.0"
+        return
+    fi
+
+    python3 -c "print(f'{$bytes / $elapsed / 1e6 * 8:.1f}')"
+}
+
+# Warmup run (discarded) — primes tunnel, caches, socket buffers
+echo "  Warmup run..." >&2
+run_once > /dev/null
+
+# Run iterations and compute median
+results=()
+for i in $(seq 1 "$ITERATIONS"); do
+    echo "  Run $i/$ITERATIONS..." >&2
+    mbps=$(run_once)
+    results+=("$mbps")
+    echo "  → $mbps Mbps" >&2
+done
+
+# Sort numerically and pick the element at index ITERATIONS/2 (0-based); for
+# an even number of runs this is the upper of the two middle values. Falls
+# back to 0.0 if that slot is empty.
+IFS=$'\n' sorted=($(printf '%s\n' "${results[@]}" | sort -n)); unset IFS
+median_idx=$(( ITERATIONS / 2 ))
+median=${sorted[$median_idx]}
+median=${median:-0.0}
+
+# Report: one JSON object with --json, otherwise a human-readable line that
+# also lists the individual run results in execution order.
+if [[ $JSON -eq 1 ]]; then
+    printf '{"name": "throughput/%s", "unit": "Mbps", "value": %s}\n' "$FEC_LABEL" "$median"
+else
+    echo "Throughput ($FEC_LABEL): $median Mbps  [runs: ${results[*]}]"
+fi
diff --git a/common.h b/common.h
index 9f59799..25f7b65 100644
--- a/common.h
+++ b/common.h
@@ -270,13 +270,13 @@ struct address_t  // TODO scope id
     char *get_str();
     void to_str(char *);
 
-    inline int is_vaild() {
+    inline int is_valid() {
         u32_t ret = ((sockaddr *)&inner)->sa_family;
         return (ret == AF_INET || ret == AF_INET6);
     }
 
     inline u32_t get_type() {
-        assert(is_vaild());
+        assert(is_valid());
         u32_t ret = ((sockaddr *)&inner)->sa_family;
         return ret;
     }
diff --git a/connection.cpp b/connection.cpp
index 9a0ee89..5c256f0 100644
--- a/connection.cpp
+++ b/connection.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "connection.h"
+#include "io_uring_recv.h"
 
 // const int disable_conv_clear=0;//a udp connection in the multiplexer is called conversation in this program,conv for short.
 
@@ -18,13 +19,22 @@ void server_clear_function(u64_t u64)  // used in conv_manager in server mode.fo
 {
     fd64_t fd64 = u64;
     assert(fd_manager.exist(fd64));
-    ev_io &watcher = fd_manager.get_info(fd64).io_watcher;
 
-    address_t &addr = fd_manager.get_info(fd64).addr;            //
-    assert(conn_manager.exist(addr));                            //
-    struct ev_loop *loop = conn_manager.find_insert(addr).loop;  // overkill ? should we just use ev_default_loop(0)?
+#ifdef __linux__
+    if (g_uring_ctx && g_uring_ctx->available) {
+        uring_cancel(g_uring_ctx, uring_tag(URING_TAG_SERVER_REMOTE, fd64));
+        uring_submit(g_uring_ctx);
+    } else
+#endif
+    {
+        ev_io &watcher = fd_manager.get_info(fd64).io_watcher;
 
-    ev_io_stop(loop, &watcher);
+        address_t &addr = fd_manager.get_info(fd64).addr;
+        assert(conn_manager.exist(addr));
+        struct ev_loop *loop = conn_manager.find_insert(addr).loop;
+
+        ev_io_stop(loop, &watcher);
+    }
 
     fd_manager.fd64_close(fd64);
 }
diff --git a/crc32/Crc32.cpp b/crc32/Crc32.cpp
index dd7b518..3581522 100644
--- a/crc32/Crc32.cpp
+++ b/crc32/Crc32.cpp
@@ -54,12 +54,12 @@
 #error "endian detection failed"
 #endif
 
-#if defined(IS_LITTLE_ENDIAN)
-  #define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
-
-#if defined(IS_BIG_ENDIAN)
-  #define __BYTE_ORDER __BIG_ENDIAN
+#ifndef __BYTE_ORDER
+  #if defined(IS_LITTLE_ENDIAN)
+    #define __BYTE_ORDER __LITTLE_ENDIAN
+  #elif defined(IS_BIG_ENDIAN)
+    #define __BYTE_ORDER __BIG_ENDIAN
+  #endif
 #endif
 
 // define endianess and some integer data types
diff --git a/crc32c.h b/crc32c.h
new file mode 100644
index 0000000..0c1fb65
--- /dev/null
+++ b/crc32c.h
@@ -0,0 +1,194 @@
+#ifndef BENCH_CRC32C_H
+#define BENCH_CRC32C_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+/*
+ * CRC32C (Castagnoli) implementation — polynomial 0x82F63B78
+ *
+ * Three paths:
+ *   crc32c_sw()  — software slicing-by-8, works everywhere
+ *   crc32c_hw()  — hardware intrinsics (SSE4.2 / ARMv8-CRC)
+ *   crc32c()     — runtime dispatch to hw or sw
+ */
+
+/* ---- Software slicing-by-8 -------------------------------------------- */
+
+/*
+ * Slicing-by-8 lookup tables for the reflected CRC32C polynomial 0x82F63B78
+ * (the bit-reversed form of 0x1EDC6F41): 8 slices of 256 entries each.
+ * They are filled on first use by crc32c_init_sw_table(); the ready flag
+ * makes later calls return immediately.
+ */
+static uint32_t crc32c_table[8][256];
+static int crc32c_table_ready = 0;
+
+static void crc32c_init_sw_table(void) {
+    if (crc32c_table_ready) return;
+    const uint32_t poly = 0x82F63B78;
+
+    /* Slice 0: the classic byte-at-a-time table, one bit per inner step */
+    for (uint32_t byte = 0; byte < 256; byte++) {
+        uint32_t reg = byte;
+        for (int bit = 0; bit < 8; bit++)
+            reg = (reg & 1u) ? ((reg >> 1) ^ poly) : (reg >> 1);
+        crc32c_table[0][byte] = reg;
+    }
+
+    /* Slices 1..7: each entry is the previous slice's entry advanced by one
+     * more zero byte through slice 0 */
+    for (int byte = 0; byte < 256; byte++) {
+        uint32_t reg = crc32c_table[0][byte];
+        for (int slice = 1; slice < 8; slice++) {
+            reg = (reg >> 8) ^ crc32c_table[0][reg & 0xFF];
+            crc32c_table[slice][byte] = reg;
+        }
+    }
+    crc32c_table_ready = 1;
+}
+
+/*
+ * Software CRC32C (slicing-by-8).
+ *
+ * data, length    - bytes to checksum
+ * previousCrc32   - result of an earlier call when checksumming data in
+ *                   pieces; 0 (the default) starts a new checksum
+ * Returns the finalized CRC (the running value is complemented on entry and
+ * again on exit, so results can be fed straight back in as previousCrc32).
+ */
+static uint32_t crc32c_sw(const void *data, size_t length, uint32_t previousCrc32 = 0) {
+    crc32c_init_sw_table();
+    const uint8_t *p = (const uint8_t *)data;
+    uint32_t crc = ~previousCrc32;
+
+    /* Process 8 bytes at a time */
+    while (length >= 8) {
+        /* Fold the first four bytes into the running CRC; they are assembled
+         * byte by byte in little-endian order, so no aligned or
+         * endian-dependent load is involved. */
+        crc ^= (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
+               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
+        /* The byte furthest from the end of the 8-byte group uses the highest
+         * slice (7), the last byte uses slice 0. */
+        crc = crc32c_table[7][crc & 0xFF] ^
+              crc32c_table[6][(crc >> 8) & 0xFF] ^
+              crc32c_table[5][(crc >> 16) & 0xFF] ^
+              crc32c_table[4][(crc >> 24) & 0xFF] ^
+              crc32c_table[3][p[4]] ^
+              crc32c_table[2][p[5]] ^
+              crc32c_table[1][p[6]] ^
+              crc32c_table[0][p[7]];
+        p += 8;
+        length -= 8;
+    }
+
+    /* Remaining bytes (fewer than 8), one table lookup each */
+    while (length--)
+        crc = crc32c_table[0][(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+
+    return ~crc;
+}
+
+/* ---- Hardware: x86_64 SSE4.2 ----------------------------------------- */
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#include <nmmintrin.h>
+
+/*
+ * Hardware CRC32C using the SSE4.2 CRC32 intrinsics. Same contract as
+ * crc32c_sw(): previousCrc32 chains partial checksums, 0 starts a new one.
+ * The target attribute lets GCC-compatible compilers build this one function
+ * with SSE4.2 enabled; callers are expected to check crc32c_has_hw() first.
+ */
+#ifdef __GNUC__
+__attribute__((target("sse4.2")))
+#endif
+static uint32_t crc32c_hw(const void *data, size_t length, uint32_t previousCrc32 = 0) {
+    const uint8_t *p = (const uint8_t *)data;
+    /* The complement is taken in 64 bits, so the upper half starts as all
+     * ones. NOTE(review): this relies on the CRC32 instruction reading only
+     * the low 32 bits of its accumulator — confirm against the Intel SDM. */
+    uint64_t crc = ~(uint64_t)previousCrc32;
+
+#if defined(__x86_64__) || defined(_M_X64)
+    /* Process 8 bytes at a time on 64-bit */
+    while (length >= 8) {
+        uint64_t val;
+        /* memcpy instead of a pointer cast: p may be unaligned */
+        __builtin_memcpy(&val, p, 8);
+        crc = _mm_crc32_u64(crc, val);
+        p += 8;
+        length -= 8;
+    }
+#endif
+    /* Process 4 bytes at a time */
+    while (length >= 4) {
+        uint32_t val;
+        __builtin_memcpy(&val, p, 4);
+        crc = _mm_crc32_u32((uint32_t)crc, val);
+        p += 4;
+        length -= 4;
+    }
+    /* Remaining bytes */
+    while (length--)
+        crc = _mm_crc32_u8((uint32_t)crc, *p++);
+
+    return ~(uint32_t)crc;
+}
+
+/*
+ * Runtime check for SSE4.2: executes CPUID with EAX=1 and returns bit 20 of
+ * ECX (1 if set, 0 otherwise). The CPUID instruction is executed on every
+ * call; nothing is cached here.
+ */
+static int crc32c_has_hw(void) {
+    uint32_t eax, ebx, ecx, edx;
+    __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                         : "a"(1));
+    return (ecx >> 20) & 1; /* SSE4.2 bit */
+}
+
+/* ---- Hardware: ARMv8-A CRC extension ---------------------------------- */
+
+#elif defined(__aarch64__) || defined(__arm__)
+#ifdef __ARM_FEATURE_CRC32
+#include <arm_acle.h>
+
+/*
+ * Hardware CRC32C using the ARM CRC32 extension intrinsics. Same contract as
+ * crc32c_sw(): previousCrc32 chains partial checksums, 0 starts a new one.
+ * Input is consumed in 8-byte words (AArch64 only), then 4-byte words, then
+ * single bytes; words are loaded with memcpy so the input may be unaligned.
+ */
+static uint32_t crc32c_hw(const void *data, size_t length, uint32_t previousCrc32 = 0) {
+    const uint8_t *cursor = (const uint8_t *)data;
+    size_t remaining = length;
+    uint32_t crc = ~previousCrc32;
+
+#ifdef __aarch64__
+    for (; remaining >= 8; cursor += 8, remaining -= 8) {
+        uint64_t word64;
+        __builtin_memcpy(&word64, cursor, 8);
+        crc = __crc32cd(crc, word64);
+    }
+#endif
+    for (; remaining >= 4; cursor += 4, remaining -= 4) {
+        uint32_t word32;
+        __builtin_memcpy(&word32, cursor, 4);
+        crc = __crc32cw(crc, word32);
+    }
+    for (; remaining > 0; ++cursor, --remaining)
+        crc = __crc32cb(crc, *cursor);
+
+    return ~crc;
+}
+
+/*
+ * ARM build with __ARM_FEATURE_CRC32 defined: always reports hardware
+ * support. This is a compile-time decision; no runtime CPU probing is done.
+ */
+static int crc32c_has_hw(void) {
+    return 1; /* If __ARM_FEATURE_CRC32 is defined, the compiler guarantees it */
+}
+
+#else /* ARM without CRC extension */
+
+/* ARM build without the CRC extension: crc32c_hw() simply forwards to the
+ * software implementation so that callers can still link against it. */
+static uint32_t crc32c_hw(const void *data, size_t length, uint32_t previousCrc32 = 0) {
+    return crc32c_sw(data, length, previousCrc32); /* fallback */
+}
+
+/* Reports that no hardware CRC32C path exists in this build. */
+static int crc32c_has_hw(void) {
+    return 0;
+}
+
+#endif /* __ARM_FEATURE_CRC32 */
+
+/* ---- No hardware support ---------------------------------------------- */
+
+#else
+
+/* Architecture with no hardware path in this header: crc32c_hw() forwards to
+ * the software implementation so that callers can still link against it. */
+static uint32_t crc32c_hw(const void *data, size_t length, uint32_t previousCrc32 = 0) {
+    return crc32c_sw(data, length, previousCrc32);
+}
+
+/* Reports that no hardware CRC32C path exists in this build. */
+static int crc32c_has_hw(void) {
+    return 0;
+}
+
+#endif /* architecture selection */
+
+/* ---- Runtime dispatch ------------------------------------------------- */
+
+/* Signature shared by crc32c_sw, crc32c_hw and the resolver below. */
+typedef uint32_t (*crc32c_fn)(const void *, size_t, uint32_t);
+
+/*
+ * crc32c_impl is the function crc32c() calls. It initially points at
+ * crc32c_resolve, which on its first invocation replaces the pointer with
+ * crc32c_hw or crc32c_sw (according to crc32c_has_hw()) and then forwards
+ * the call, so the capability check runs once rather than per checksum.
+ * Because the pointer is `static` in a header, every translation unit that
+ * includes this file has its own copy and resolves independently.
+ */
+static uint32_t crc32c_resolve(const void *data, size_t length, uint32_t previousCrc32);
+static crc32c_fn crc32c_impl = crc32c_resolve;
+
+static uint32_t crc32c_resolve(const void *data, size_t length, uint32_t previousCrc32) {
+    crc32c_impl = crc32c_has_hw() ? crc32c_hw : crc32c_sw;
+    return crc32c_impl(data, length, previousCrc32);
+}
+
+/*
+ * Public entry point: CRC32C of data[0..length) via the selected
+ * implementation. Pass the result of a previous call as previousCrc32 to
+ * checksum data in pieces; the default 0 starts a new checksum.
+ */
+static inline uint32_t crc32c(const void *data, size_t length, uint32_t previousCrc32 = 0) {
+    return crc32c_impl(data, length, previousCrc32);
+}
+
+#endif /* BENCH_CRC32C_H */
diff --git a/delay_manager.cpp b/delay_manager.cpp
index f8a9aa4..fdd1498 100644
--- a/delay_manager.cpp
+++ b/delay_manager.cpp
@@ -48,9 +48,7 @@ int delay_manager_t::add(my_time_t delay, const dest_t &dest, char *data, int le
         return -1;
     }
     if (delay == 0) {
-        static char buf[buf_len];
-        delay_data.data = buf;
-        memcpy(buf, data, len);
+        delay_data.data = data;
         int ret = delay_data.handle();
         if (ret != 0) {
             mylog(log_trace, "handle() return %d\n", ret);
diff --git a/fec_manager.cpp b/fec_manager.cpp
index 5d132c8..40bd917 100644
--- a/fec_manager.cpp
+++ b/fec_manager.cpp
@@ -510,39 +510,43 @@ int fec_decode_manager_t::input(char *s, int len) {
         mylog(log_warn, "data_num+redundant_num>=max_fec_packet_num\n");
         return -1;
     }
-    if (!anti_replay.is_vaild(seq)) {
-        mylog(log_trace, "!anti_replay.is_vaild(seq) ,seq =%u\n", seq);
+    if (inner_index >= data_num + redundant_num) {
+        mylog(log_warn, "inner_index(%d) >= data_num+redundant_num(%d+%d)\n", inner_index, data_num, redundant_num);
+        return -1;
+    }
+    if (!anti_replay.is_valid(seq)) {
+        mylog(log_trace, "!anti_replay.is_valid(seq) ,seq =%u\n", seq);
         return 0;
     }
 
-    if (mp[seq].fec_done != 0) {
+    fec_group_t &group = group_find_or_create(seq);
+
+    if (group.fec_done != 0) {
         mylog(log_debug, "fec already done, ignore, seq=%u\n", seq);
         return -1;
     }
 
-    if (mp[seq].group_mp.find(inner_index) != mp[seq].group_mp.end()) {
+    if (group.has_shard(inner_index)) {
         mylog(log_debug, "dup fec index\n");  // duplicate can happen on  a normal network, so its just log_debug
         return -1;
     }
 
-    if (mp[seq].type == -1)
-        mp[seq].type = type;
+    if (group.type == -1)
+        group.type = type;
     else {
-        if (mp[seq].type != type) {
+        if (group.type != type) {
             mylog(log_warn, "type mismatch\n");
             return -1;
         }
     }
 
     if (data_num != 0) {
-        // mp[seq].data_counter++;
-
-        if (mp[seq].data_num == -1) {
-            mp[seq].data_num = data_num;
-            mp[seq].redundant_num = redundant_num;
-            mp[seq].len = len;
+        if (group.data_num == -1) {
+            group.data_num = data_num;
+            group.redundant_num = redundant_num;
+            group.len = len;
         } else {
-            if (mp[seq].data_num != data_num || mp[seq].redundant_num != redundant_num || mp[seq].len != len) {
+            if (group.data_num != data_num || group.redundant_num != redundant_num || group.len != len) {
                 mylog(log_warn, "unexpected mp[seq].data_num!=data_num||mp[seq].redundant_num!=redundant_num||mp[seq].len!=len\n");
                 return -1;
             }
@@ -555,11 +559,11 @@ int fec_decode_manager_t::input(char *s, int len) {
         u32_t tmp_seq = fec_data[index].seq;
         anti_replay.set_invaild(tmp_seq);
 
-        auto tmp_it = mp.find(tmp_seq);
-        if (tmp_it != mp.end()) {
-            int x = tmp_it->second.data_num;
-            int y = tmp_it->second.redundant_num;
-            int cnt = tmp_it->second.group_mp.size();
+        fec_group_t *tmp_group = group_find(tmp_seq);
+        if (tmp_group) {
+            int x = tmp_group->data_num;
+            int y = tmp_group->redundant_num;
+            int cnt = tmp_group->shard_count;
 
             if (cnt < x) {
                 if (debug_fec_dec)
@@ -567,7 +571,7 @@ int fec_decode_manager_t::input(char *s, int len) {
                 else
                     mylog(log_trace, "[dec][failed]seq=%08x x=%d y=%d cnt=%d\n", tmp_seq, x, y, cnt);
             }
-            mp.erase(tmp_it);
+            group_erase(tmp_seq);
         }
         if (tmp_seq == seq) {
             mylog(log_warn, "unexpected tmp_seq==seq ,seq=%d\n", seq);
@@ -585,58 +589,57 @@ int fec_decode_manager_t::input(char *s, int len) {
     assert(0 <= index && index < (int)fec_buff_num);
     assert(len + 100 < buf_len);
     memcpy(fec_data[index].buf, s + tmp_idx, len);
-    mp[seq].group_mp[inner_index] = index;
+    group.set_shard(inner_index, index);
+    group.shard_count++;
     // index++ at end of function
 
-    map<int, int> &inner_mp = mp[seq].group_mp;
-
     int about_to_fec = 0;
     if (type == 0) {
-        // assert((int)inner_mp.size()<=data_num);
-        if ((int)inner_mp.size() > data_num) {
-            mylog(log_warn, "inner_mp.size()>data_num\n");
+        if (group.shard_count > data_num) {
+            mylog(log_warn, "shard_count>data_num\n");
             anti_replay.set_invaild(seq);
             goto end;
         }
-        if ((int)inner_mp.size() == data_num)
+        if (group.shard_count == data_num)
             about_to_fec = 1;
     } else {
-        if (mp[seq].data_num != -1) {
-            if ((int)inner_mp.size() > mp[seq].data_num + 1) {
-                mylog(log_warn, "inner_mp.size()>data_num+1\n");
+        if (group.data_num != -1) {
+            if (group.shard_count > group.data_num + 1) {
+                mylog(log_warn, "shard_count>data_num+1\n");
                 anti_replay.set_invaild(seq);
                 goto end;
             }
-            if ((int)inner_mp.size() >= mp[seq].data_num) {
+            if (group.shard_count >= group.data_num) {
                 about_to_fec = 1;
             }
         }
     }
 
     if (about_to_fec) {
-        int group_data_num = mp[seq].data_num;
-        int group_redundant_num = mp[seq].redundant_num;
+        int group_data_num = group.data_num;
+        int group_redundant_num = group.redundant_num;
 
         int x_got = 0;
         int y_got = 0;
         // mylog(log_error,"fec here!\n");
         if (type == 0) {
             char *fec_tmp_arr[max_fec_packet_num + 5] = {0};
-            for (auto it = inner_mp.begin(); it != inner_mp.end(); it++) {
-                if (it->first < group_data_num)
+            for (int i = 0; i < group_data_num + group_redundant_num; i++) {
+                if (!group.has_shard(i)) continue;
+                if (i < group_data_num)
                     x_got++;
                 else
                     y_got++;
-                fec_tmp_arr[it->first] = fec_data[it->second].buf;
+                fec_tmp_arr[i] = fec_data[group.shard_idx[i]].buf;
             }
             assert(rs_decode2(group_data_num, group_data_num + group_redundant_num, fec_tmp_arr, len) == 0);  // the input data has been modified in-place
             // this line should always succeed
-            mp[seq].fec_done = 1;
+            group.fec_done = 1;
 
             if (debug_fec_dec)
-                mylog(log_debug, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, int(inner_mp.size()), x_got, y_got);
+                mylog(log_debug, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, group.shard_count, x_got, y_got);
             else
-                mylog(log_trace, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, int(inner_mp.size()), x_got, y_got);
+                mylog(log_trace, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, group.shard_count, x_got, y_got);
 
             blob_decode.clear();
             for (int i = 0; i < group_data_num; i++) {
@@ -656,34 +659,32 @@ int fec_decode_manager_t::input(char *s, int len) {
             int max_len = -1;
             int fec_result_ok = 1;
             int data_check_ok = 1;
-            int debug_num = inner_mp.size();
+            int debug_num = group.shard_count;
 
             int missed_packet[max_fec_packet_num + 5];
             int missed_packet_counter = 0;
 
-            // outupt_s_arr_buf[max_fec_packet_num+5]={0};
-
-            // memset(output_s_arr_buf,0,sizeof(output_s_arr_buf));//in efficient
-
             for (int i = 0; i < group_data_num + group_redundant_num; i++) {
-                output_s_arr_buf[i] = 0;
-            }
-            for (auto it = inner_mp.begin(); it != inner_mp.end(); it++) {
-                if (it->first < group_data_num)
+                if (!group.has_shard(i)) {
+                    output_s_arr_buf[i] = 0;
+                    continue;
+                }
+                int di = group.shard_idx[i];
+                if (i < group_data_num)
                     x_got++;
                 else
                     y_got++;
 
-                output_s_arr_buf[it->first] = fec_data[it->second].buf;
-                if (fec_data[it->second].len < (int)sizeof(u16_t)) {
-                    mylog(log_warn, "fec_data[it->second].len<(int)sizeof(u16_t)");
+                output_s_arr_buf[i] = fec_data[di].buf;
+                if (fec_data[di].len < (int)sizeof(u16_t)) {
+                    mylog(log_warn, "fec_data[di].len<(int)sizeof(u16_t)");
                     data_check_ok = 0;
                 }
 
-                if (fec_data[it->second].len > max_len)
-                    max_len = fec_data[it->second].len;
+                if (fec_data[di].len > max_len)
+                    max_len = fec_data[di].len;
             }
-            if (max_len != mp[seq].len) {
+            if (max_len != group.len) {
                 data_check_ok = 0;
                 mylog(log_warn, "max_len!=mp[seq].len");
             }
@@ -693,10 +694,13 @@ int fec_decode_manager_t::input(char *s, int len) {
                 anti_replay.set_invaild(seq);
                 goto end;
             }
-            for (auto it = inner_mp.begin(); it != inner_mp.end(); it++) {
-                int tmp_idx = it->second;
-                assert(max_len >= fec_data[tmp_idx].len);  // guarenteed by data_check_ok
-                memset(fec_data[tmp_idx].buf + fec_data[tmp_idx].len, 0, max_len - fec_data[tmp_idx].len);
+            for (int i = 0; i < group_data_num + group_redundant_num; i++) {
+                if (!group.has_shard(i)) continue;
+                int di = group.shard_idx[i];
+                assert(max_len >= fec_data[di].len);  // guarenteed by data_check_ok
+                int pad = max_len - fec_data[di].len;
+                if (pad > 0)
+                    memset(fec_data[di].buf + fec_data[di].len, 0, pad);
             }
 
             for (int i = 0; i < group_data_num; i++) {
@@ -708,7 +712,7 @@ int fec_decode_manager_t::input(char *s, int len) {
             mylog(log_trace, "fec done,%d %d,missed_packet_counter=%d\n", group_data_num, group_redundant_num, missed_packet_counter);
 
             assert(rs_decode2(group_data_num, group_data_num + group_redundant_num, output_s_arr_buf, max_len) == 0);  // this should always succeed
-            mp[seq].fec_done = 1;
+            group.fec_done = 1;
 
             int sum_ori = 0;
 
diff --git a/fec_manager.h b/fec_manager.h
index f92b651..dd809dd 100644
--- a/fec_manager.h
+++ b/fec_manager.h
@@ -13,7 +13,7 @@
 #include "lib/rs.h"
 
 const int max_blob_packet_num = 30000;      // how many packet can be contain in a blob_t ,can be set very large
-const u32_t anti_replay_buff_size = 30000;  // can be set very large
+const u32_t anti_replay_table_size = 32768;  // power of 2 for fast modulo
 
 const int max_fec_packet_num = 255;  // this is the limitation of the rs lib
 extern u32_t fec_buff_num;
@@ -182,55 +182,29 @@ struct fec_parameter_t {
 extern fec_parameter_t g_fec_par;
 // extern int dynamic_update_fec;
 
-const int anti_replay_timeout = 120 * 1000;  // 120s
-
 struct anti_replay_t {
-    struct info_t {
-        my_time_t my_time;
-        int index;
-    };
+    /* Direct-mapped table: slot = seq & MASK, stores the seq that owns it.
+     * is_valid: table[slot] != seq → valid (not yet seen).
+     * set_invaild: table[slot] = seq.
+     * Old entries naturally evicted when a new seq maps to the same slot.
+     * With 32K slots and monotonically increasing seqs, effective window
+     * is ~32K groups — comparable to the old 30K ring buffer. */
+    static const u32_t TABLE_MASK = anti_replay_table_size - 1;
+
+    u32_t table[anti_replay_table_size];
 
-    u64_t replay_buffer[anti_replay_buff_size];
-    unordered_map<u32_t, info_t> mp;
-    int index;
     anti_replay_t() {
         clear();
     }
     int clear() {
-        memset(replay_buffer, -1, sizeof(replay_buffer));
-        mp.clear();
-        mp.rehash(anti_replay_buff_size * 3);
-        index = 0;
+        for (u32_t i = 0; i < anti_replay_table_size; i++) table[i] = i ^ 1;  /* empty marker: i ^ 1 maps to slot i ^ 1, never to slot i, so no seq is pre-marked as seen (an all-0xFF fill rejected seq 0xFFFFFFFF) */
         return 0;
     }
     void set_invaild(u32_t seq) {
-        if (is_vaild(seq) == 0) {
-            mylog(log_trace, "seq %u exist\n", seq);
-            // assert(mp.find(seq)!=mp.end());
-            // mp[seq].my_time=get_current_time_rough();
-            return;
-        }
-        if (replay_buffer[index] != u64_t(i64_t(-1))) {
-            assert(mp.find(replay_buffer[index]) != mp.end());
-            mp.erase(replay_buffer[index]);
-        }
-        replay_buffer[index] = seq;
-        assert(mp.find(seq) == mp.end());
-        mp[seq].my_time = get_current_time();
-        mp[seq].index = index;
-        index++;
-        if (index == int(anti_replay_buff_size)) index = 0;
+        table[seq & TABLE_MASK] = seq;  /* overwrites (evicts) whichever seq owned this slot before */
     }
-    int is_vaild(u32_t seq) {
-        if (mp.find(seq) == mp.end()) return 1;
-
-        if (get_current_time() - mp[seq].my_time > anti_replay_timeout) {
-            replay_buffer[mp[seq].index] = u64_t(i64_t(-1));
-            mp.erase(seq);
-            return 1;
-        }
-
-        return 0;
+    int is_valid(u32_t seq) {
+        return table[seq & TABLE_MASK] != seq;  /* 1 = not recorded in its slot, 0 = recorded */
     }
 };
 
@@ -374,18 +348,45 @@ struct fec_data_t {
     int len;
 };
 struct fec_group_t {
-    int type = -1;
-    int data_num = -1;
-    int redundant_num = -1;
-    int len = -1;
-    int fec_done = 0;
-    // int data_counter=0;
-    map<int, int> group_mp;
+    u32_t seq;           /* owner seq, 0xFFFFFFFF = empty slot; NOTE(review): a real seq of 0xFFFFFFFF is indistinguishable from empty — confirm it cannot occur */
+    int type;  /* -1 after init(), until the decoder records the first packet's type */
+    int data_num;  /* -1 after init(), until a packet carrying data_num != 0 sets it */
+    int redundant_num;  /* set together with data_num */
+    int len;  /* set together with data_num */
+    int fec_done;  /* 0 after init(); the decoder sets 1 once the group has been decoded */
+    int shard_count;  /* maintained by the caller; set_shard() does not touch it */
+    u32_t shard_bitmap[8];  /* 256 bits — replaces 1KB memset of shard_idx */
+    int shard_idx[max_fec_packet_num + 1];  /* only valid where bitmap bit is set */
+
+    void init(u32_t new_seq) {
+        seq = new_seq;
+        type = -1;
+        data_num = -1;
+        redundant_num = -1;
+        len = -1;
+        fec_done = 0;
+        shard_count = 0;
+        memset(shard_bitmap, 0, sizeof(shard_bitmap));  /* 32 bytes vs old 1024; shard_idx is deliberately left as-is */
+    }
+    int has_shard(int i) const {
+        return (shard_bitmap[i >> 5] >> (i & 31)) & 1;  /* word i/32, bit i%32; no bounds check, i must be 0..255 */
+    }
+    void set_shard(int i, int val) {
+        shard_bitmap[i >> 5] |= (1u << (i & 31));  /* mark shard i present */
+        shard_idx[i] = val;  /* value stored for shard i; the decoder uses it as an index into fec_data */
+    }
 };
 class fec_decode_manager_t : not_copy_able_t {
     anti_replay_t anti_replay;
     fec_data_t *fec_data = 0;
-    unordered_map<u32_t, fec_group_t> mp;
+
+    /* Direct-mapped group table: slot = seq & group_table_mask.
+     * Monotonically increasing seqs guarantee no two concurrent groups
+     * collide (table_size > max concurrent groups ≈ fec_buff_num). */
+    fec_group_t *group_table = 0;
+    u32_t group_table_size;
+    u32_t group_table_mask;
+
     blob_decode_t blob_decode;
 
     int index;
@@ -398,28 +399,45 @@ class fec_decode_manager_t : not_copy_able_t {
     char *output_s_arr_buf[max_fec_packet_num + 100];  // only for type=1,for type=0 the buf inside blot_t is used
     int output_len_arr_buf[max_fec_packet_num + 100];  // same
 
+    fec_group_t &group_find_or_create(u32_t seq) {
+        fec_group_t &g = group_table[seq & group_table_mask];  /* direct-mapped: exactly one candidate slot per seq */
+        if (g.seq != seq) g.init(seq);  /* empty or owned by another seq: take the slot over, discarding its state. NOTE(review): seq == 0xFFFFFFFF matches an empty slot and skips init() */
+        return g;
+    }
+    fec_group_t *group_find(u32_t seq) {
+        fec_group_t *slot = group_table + (seq & group_table_mask);
+        return slot->seq == seq ? slot : 0;  /* 0 when the slot is not owned by seq */
+    }
+    void group_erase(u32_t seq) {
+        fec_group_t *slot = group_table + (seq & group_table_mask);
+        if (slot->seq == seq) slot->seq = 0xFFFFFFFF;  /* mark empty only while seq still owns the slot */
+    }
+
    public:
     fec_decode_manager_t() {
+        /* Table size: next power of 2 >= fec_buff_num * 2 */
+        group_table_size = 1;
+        while (group_table_size < fec_buff_num * 2) group_table_size <<= 1;
+        group_table_mask = group_table_size - 1;  /* size is a power of 2, so size - 1 works as an index mask */
+
         fec_data = new fec_data_t[fec_buff_num + 5];
+        group_table = new fec_group_t[group_table_size];  /* fec_group_t has no initializers: slots are indeterminate until clear() stamps seq */
         assert(fec_data != 0);
+        assert(group_table != 0);
         clear();
     }
-    /*
-    fec_decode_manager_t(const fec_decode_manager_t &b)
-    {
-            assert(0==1);//not allowed to copy
-    }*/
     ~fec_decode_manager_t() {
         mylog(log_debug, "fec_decode_manager destroyed\n");
         if (fec_data != 0) {
             mylog(log_debug, "fec_data freed\n");
             delete[] fec_data;
         }
+        delete[] group_table;  /* no guard needed: delete[] on a null pointer is a no-op */
     }
     int clear() {
         anti_replay.clear();
-        mp.clear();
-        mp.rehash(fec_buff_num * 3);
+        for (u32_t i = 0; i < group_table_size; i++)
+            group_table[i].seq = 0xFFFFFFFF;
 
         for (int i = 0; i < (int)fec_buff_num; i++)
             fec_data[i].used = 0;
diff --git a/io_uring_recv.cpp b/io_uring_recv.cpp
new file mode 100644
index 0000000..7e23afc
--- /dev/null
+++ b/io_uring_recv.cpp
@@ -0,0 +1,471 @@
+#include "io_uring_recv.h"
+
+uring_ctx_t *g_uring_ctx = NULL;
+
+#ifdef __linux__
+
+#include "log.h"
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+/* --- Raw syscall wrappers ------------------------------------------------ */
+
+static int
+sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
+{
+    return static_cast<int>(syscall(__NR_io_uring_setup, entries, p)); /* raw syscall result narrowed to int */
+}
+
+static int
+sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+                   unsigned flags, void *arg, size_t argsz)
+{
+    const long rc = syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, arg, argsz);
+    return static_cast<int>(rc); /* raw syscall result narrowed to int */
+}
+
+static int
+sys_io_uring_register(int fd, unsigned opcode, void *arg, unsigned nr_args)
+{
+    return static_cast<int>(syscall(__NR_io_uring_register, fd, opcode, arg, nr_args)); /* raw syscall result narrowed to int */
+}
+
+/* --- Memory barrier helpers ---------------------------------------------- */
+
+static inline void
+io_uring_smp_store_release(unsigned *p, unsigned v)
+{
+    __atomic_store_n(p, v, __ATOMIC_RELEASE); /* release: earlier writes become visible before the new value of *p */
+}
+
+static inline unsigned
+io_uring_smp_load_acquire(const unsigned *p)
+{
+    return __atomic_load_n(p, __ATOMIC_ACQUIRE); /* acquire: later reads cannot be reordered before this load */
+}
+
+/* --- SQE helpers --------------------------------------------------------- */
+
+static struct io_uring_sqe *
+get_sqe(uring_ctx_t *ctx)
+{
+    /* Returns the SQE slot at the current tail without advancing it; submit_sqe() advances the tail. */
+    const unsigned consumed = io_uring_smp_load_acquire(ctx->sq_head);
+    const unsigned produced = *ctx->sq_tail;
+    const unsigned in_flight = produced - consumed; /* unsigned subtraction, wrap-safe */
+    if (in_flight < ctx->sq_entries) return ctx->sqes + (produced & ctx->sq_mask);
+    return NULL; /* SQ full */
+}
+
+static void
+submit_sqe(uring_ctx_t *ctx)
+{
+    const unsigned tail = *ctx->sq_tail, slot = tail & ctx->sq_mask;
+    ctx->sq_array[slot] = slot;                         /* identity mapping: array entry i names SQE i */
+    io_uring_smp_store_release(ctx->sq_tail, tail + 1); /* release store advances the tail by one */
+}
+
+/* --- Buffer ring helpers ------------------------------------------------- */
+
+static void
+buf_ring_add(uring_ctx_t *ctx, int buf_id)
+{
+    /* Fill the entry at the ring's current tail and publish it at once; runtime recycling uses buf_ring_add_deferred + commit. */
+    struct io_uring_buf_ring *ring = ctx->buf_ring;
+    const unsigned short tail = ring->tail;
+    struct io_uring_buf *slot = ring->bufs + (tail & (ctx->buf_count - 1));
+    slot->addr = (unsigned long long)(ctx->buf_pool + (long)buf_id * ctx->buf_size + URING_RECV_HEADROOM);
+    slot->len = (__u32)(ctx->buf_size - URING_RECV_HEADROOM);
+    slot->bid = (__u16)buf_id;
+    __atomic_store_n(&ring->tail, (__u16)(tail + 1), __ATOMIC_RELEASE);
+}
+
+static void
+buf_ring_add_deferred(uring_ctx_t *ctx, int buf_id)
+{
+    /* Stage an entry at the shadow tail (ctx->buf_ring_pending).  The shared tail is
+       left untouched here; the caller must call uring_buf_ring_commit() when done. */
+    const unsigned short shadow = ctx->buf_ring_pending;
+    struct io_uring_buf *slot = ctx->buf_ring->bufs + (shadow & (ctx->buf_count - 1));
+    char *writable = ctx->buf_pool + (long)buf_id * ctx->buf_size + URING_RECV_HEADROOM;
+    slot->addr = (unsigned long long)writable;
+    slot->len = (__u32)(ctx->buf_size - URING_RECV_HEADROOM);
+    slot->bid = (__u16)buf_id;
+    ctx->buf_ring_pending = (__u16)(shadow + 1);
+}
+
+/* --- Public API ---------------------------------------------------------- */
+
+int
+uring_init(uring_ctx_t *ctx, int queue_depth, int buf_count, int buf_size)
+{
+    /* Builds the ring, its mmap'd queues, the buffer pool and the provided-buffer ring.
+       Returns 0 on success, -1 otherwise.  ctx is reset before the env-var opt-out so
+       that early return no longer leaves ring_fd and the pointers uninitialized. */
+    memset(ctx, 0, sizeof(*ctx)); /* also zeroes available and bgid */
+    ctx->ring_fd = -1;
+    ctx->buf_count = buf_count;
+    ctx->buf_size = buf_size;
+    if (getenv("UDPSPEEDER_NO_URING")) {
+        mylog(log_info, "io_uring: disabled by UDPSPEEDER_NO_URING\n");
+        return -1;
+    }
+
+    /* buf_count must be power of 2 for the ring */
+    if (buf_count & (buf_count - 1)) {
+        mylog(log_warn, "io_uring: buf_count must be power of 2\n");
+        return -1;
+    }
+
+    /* 1. io_uring_setup */
+    struct io_uring_params params;
+    memset(&params, 0, sizeof(params));
+    /* CQ sized 4x buf_count: 2 multishot requests share the CQ, and we need
+       headroom for error/cancel CQEs that don't consume buffers. */
+    params.flags = IORING_SETUP_CQSIZE;
+    params.cq_entries = (unsigned)(buf_count * 4);
+
+    /* Try performance flags.  Fall back if kernel rejects. */
+    unsigned opt_flags = 0;
+#ifdef IORING_SETUP_COOP_TASKRUN
+    opt_flags |= IORING_SETUP_COOP_TASKRUN;
+#endif
+#ifdef IORING_SETUP_SINGLE_ISSUER
+    opt_flags |= IORING_SETUP_SINGLE_ISSUER;
+#endif
+
+    params.flags |= opt_flags;
+    int fd = sys_io_uring_setup((unsigned)queue_depth, &params);
+    if (fd < 0 && opt_flags) {
+        /* Retry without optional flags */
+        memset(&params, 0, sizeof(params));
+        params.flags = IORING_SETUP_CQSIZE;
+        params.cq_entries = (unsigned)(buf_count * 4);
+        fd = sys_io_uring_setup((unsigned)queue_depth, &params);
+    }
+    if (fd < 0) {
+        mylog(log_info, "io_uring: io_uring_setup failed (errno %d), using fallback\n", errno);
+        return -1;
+    }
+    ctx->ring_fd = fd;
+    ctx->sq_entries = params.sq_entries;
+    ctx->cq_entries = params.cq_entries;
+
+    /* 2. mmap SQ ring */
+    ctx->sq_ring_sz = (size_t)(params.sq_off.array + params.sq_entries * sizeof(unsigned));
+    ctx->sq_ring_ptr = mmap(NULL, ctx->sq_ring_sz, PROT_READ | PROT_WRITE,
+                            MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
+    if (ctx->sq_ring_ptr == MAP_FAILED) {
+        mylog(log_warn, "io_uring: mmap SQ ring failed\n");
+        goto fail;
+    }
+    ctx->sq_head = (unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.head);
+    ctx->sq_tail = (unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.tail);
+    ctx->sq_mask = *(unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.ring_mask);
+    ctx->sq_array = (unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.array);
+
+    /* 3. mmap SQEs */
+    ctx->sqes_sz = (size_t)(params.sq_entries * sizeof(struct io_uring_sqe));
+    ctx->sqes = (struct io_uring_sqe *)mmap(NULL, ctx->sqes_sz, PROT_READ | PROT_WRITE,
+                                             MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
+    if (ctx->sqes == MAP_FAILED) {
+        mylog(log_warn, "io_uring: mmap SQEs failed\n");
+        goto fail;
+    }
+
+    /* 4. mmap CQ ring */
+    ctx->cq_ring_sz = (size_t)(params.cq_off.cqes + params.cq_entries * sizeof(struct io_uring_cqe));
+    ctx->cq_ring_ptr = mmap(NULL, ctx->cq_ring_sz, PROT_READ | PROT_WRITE,
+                            MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+    if (ctx->cq_ring_ptr == MAP_FAILED) {
+        mylog(log_warn, "io_uring: mmap CQ ring failed\n");
+        goto fail;
+    }
+    ctx->cq_head = (unsigned *)((char *)ctx->cq_ring_ptr + params.cq_off.head);
+    ctx->cq_tail = (unsigned *)((char *)ctx->cq_ring_ptr + params.cq_off.tail);
+    ctx->cq_mask = *(unsigned *)((char *)ctx->cq_ring_ptr + params.cq_off.ring_mask);
+    ctx->cqes = (struct io_uring_cqe *)((char *)ctx->cq_ring_ptr + params.cq_off.cqes);
+
+    /* 5. Allocate buffer pool */
+    ctx->buf_pool = (char *)aligned_alloc(4096, (size_t)buf_count * (size_t)buf_size);
+    if (!ctx->buf_pool) {
+        mylog(log_warn, "io_uring: buf_pool alloc failed\n");
+        goto fail;
+    }
+
+    /* 6. Set up provided buffer ring */
+    {
+        size_t ring_sz = sizeof(struct io_uring_buf_ring) +
+                         (size_t)buf_count * sizeof(struct io_uring_buf);
+        /* Must be page-aligned for kernel registration */
+        size_t page = (size_t)sysconf(_SC_PAGESIZE);
+        ring_sz = (ring_sz + page - 1) & ~(page - 1);
+
+        ctx->buf_ring = (struct io_uring_buf_ring *)mmap(
+            NULL, ring_sz, PROT_READ | PROT_WRITE,
+            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+        if (ctx->buf_ring == MAP_FAILED) {
+            ctx->buf_ring = NULL;
+            mylog(log_warn, "io_uring: buf_ring mmap failed\n");
+            goto fail;
+        }
+        memset(ctx->buf_ring, 0, ring_sz);
+        ctx->buf_ring->tail = 0;
+        ctx->buf_ring_pending = 0;
+
+        struct io_uring_buf_reg reg;
+        memset(&reg, 0, sizeof(reg));
+        reg.ring_addr = (unsigned long long)ctx->buf_ring;
+        reg.ring_entries = (__u32)buf_count;
+        reg.bgid = (__u16)ctx->bgid;
+
+        int ret = sys_io_uring_register(fd, IORING_REGISTER_PBUF_RING, &reg, 1);
+        if (ret < 0) {
+            mylog(log_info, "io_uring: REGISTER_PBUF_RING failed (errno %d), kernel too old?\n", errno);
+            goto fail;
+        }
+
+        /* Populate the buffer ring with all buffers */
+        for (int i = 0; i < buf_count; i++) {
+            buf_ring_add(ctx, i);
+        }
+        ctx->buf_ring_pending = ctx->buf_ring->tail;
+    }
+
+    /* 7. Initialize msghdr template for recvmsg */
+    memset(&ctx->recvmsg_hdr, 0, sizeof(ctx->recvmsg_hdr));
+    memset(&ctx->recvmsg_name, 0, sizeof(ctx->recvmsg_name));
+    ctx->recvmsg_hdr.msg_name = &ctx->recvmsg_name;
+    ctx->recvmsg_hdr.msg_namelen = sizeof(ctx->recvmsg_name);
+
+    ctx->available = 1;
+    mylog(log_info, "io_uring: initialized (ring_fd=%d, %d buffers × %d bytes, cq=%u)\n",
+          fd, buf_count, buf_size, ctx->cq_entries);
+    return 0;
+
+fail:
+    uring_destroy(ctx);
+    return -1;
+}
+
+void
+uring_destroy(uring_ctx_t *ctx)
+{
+    /* Releases what uring_init() acquired and clears each handle, so a repeated call is a no-op. */
+    if (ctx->buf_ring) {
+        size_t page = (size_t)sysconf(_SC_PAGESIZE);
+        size_t ring_sz = sizeof(struct io_uring_buf_ring) +
+                         (size_t)ctx->buf_count * sizeof(struct io_uring_buf);
+        ring_sz = (ring_sz + page - 1) & ~(page - 1); /* same rounding as the mmap in uring_init */
+        munmap(ctx->buf_ring, ring_sz);
+        ctx->buf_ring = NULL;
+    }
+    free(ctx->buf_pool);
+    ctx->buf_pool = NULL;
+    if (ctx->sqes && ctx->sqes != MAP_FAILED)
+        munmap(ctx->sqes, ctx->sqes_sz);
+    if (ctx->sq_ring_ptr && ctx->sq_ring_ptr != MAP_FAILED)
+        munmap(ctx->sq_ring_ptr, ctx->sq_ring_sz);
+    if (ctx->cq_ring_ptr && ctx->cq_ring_ptr != MAP_FAILED)
+        munmap(ctx->cq_ring_ptr, ctx->cq_ring_sz);
+    ctx->sqes = NULL; /* stale mapping addresses must never be unmapped a second time */
+    ctx->sq_ring_ptr = ctx->cq_ring_ptr = NULL;
+    if (ctx->ring_fd >= 0)
+        close(ctx->ring_fd);
+    ctx->ring_fd = -1;
+    ctx->available = 0;
+}
+
+int
+uring_add_multishot_recvmsg(uring_ctx_t *ctx, int fd, uint64_t user_data)
+{
+    /* Place one multishot RECVMSG for fd in the SQ, selecting buffers from group ctx->bgid.
+       Returns -1 when no SQ slot is free, 0 otherwise. */
+    struct io_uring_sqe *entry = get_sqe(ctx);
+    if (entry == NULL) return -1;
+    memset(entry, 0, sizeof(*entry));
+    entry->user_data = user_data;
+    entry->fd = fd;
+    entry->opcode = IORING_OP_RECVMSG;
+    entry->ioprio = IORING_RECV_MULTISHOT;
+    entry->flags = IOSQE_BUFFER_SELECT;
+    entry->buf_group = (__u16)ctx->bgid;
+    entry->addr = (unsigned long long)&ctx->recvmsg_hdr; /* msghdr template filled in by uring_init */
+    submit_sqe(ctx);
+    return 0;
+}
+
+int
+uring_add_multishot_recv(uring_ctx_t *ctx, int fd, uint64_t user_data)
+{
+    /* Place one multishot RECV for fd in the SQ, selecting buffers from group ctx->bgid.
+       Returns -1 when no SQ slot is free, 0 otherwise. */
+    struct io_uring_sqe *entry = get_sqe(ctx);
+    if (entry == NULL) return -1;
+    memset(entry, 0, sizeof(*entry));
+    entry->user_data = user_data;
+    entry->fd = fd;
+    entry->opcode = IORING_OP_RECV;
+    entry->ioprio = IORING_RECV_MULTISHOT;
+    entry->flags = IOSQE_BUFFER_SELECT;
+    entry->buf_group = (__u16)ctx->bgid;
+    submit_sqe(ctx);
+    return 0;
+}
+
+int
+uring_cancel(uring_ctx_t *ctx, uint64_t user_data)
+{
+    /* Place an ASYNC_CANCEL in the SQ aimed at the request tagged with user_data.
+       Returns -1 when no SQ slot is free, 0 otherwise. */
+    struct io_uring_sqe *entry = get_sqe(ctx);
+    if (entry == NULL) return -1;
+    memset(entry, 0, sizeof(*entry));
+    entry->addr = user_data; /* cancels SQE matching this user_data */
+    entry->opcode = IORING_OP_ASYNC_CANCEL;
+    submit_sqe(ctx);
+    return 0;
+}
+
+int
+uring_submit(uring_ctx_t *ctx)
+{
+    /* Passes the SQEs between sq_head and sq_tail to io_uring_enter.  Returns its result,
+       0 if none are pending, -1 on error.  NOTE(review): SQ_WAKEUP targets SQPOLL rings; confirm it is needed. */
+    const unsigned pending = *ctx->sq_tail - io_uring_smp_load_acquire(ctx->sq_head);
+    if (pending == 0)
+        return 0;
+    const int rc = sys_io_uring_enter(ctx->ring_fd, pending, 0, IORING_ENTER_SQ_WAKEUP, NULL, 0);
+    if (rc >= 0)
+        return rc;
+    mylog(log_warn, "io_uring: io_uring_enter submit failed (errno %d)\n", errno);
+    return -1;
+}
+
+int
+uring_submit_and_flush(uring_ctx_t *ctx)
+{
+    /* One io_uring_enter that submits any pending SQEs and passes GETEVENTS
+       (min_complete is 0).  Returns the syscall's result, or -1 on error. */
+    const unsigned pending = *ctx->sq_tail - io_uring_smp_load_acquire(ctx->sq_head);
+    const unsigned enter_flags = pending > 0 ? (IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)
+                                             : IORING_ENTER_GETEVENTS;
+
+    const int rc = sys_io_uring_enter(ctx->ring_fd, pending, 0, enter_flags, NULL, 0);
+    if (rc >= 0)
+        return rc;
+    mylog(log_warn, "io_uring: io_uring_enter submit+flush failed (errno %d)\n", errno);
+    return -1;
+}
+
+/* --- Batched CQ drain API ------------------------------------------------ */
+
+unsigned
+uring_cq_ready(uring_ctx_t *ctx)
+{
+    /* Number of CQEs between our head and the kernel's tail (unsigned, wrap-safe). */
+    const unsigned consumed = *ctx->cq_head; /* our variable, no barrier needed */
+    /* Acquire on tail ensures we see CQE data the kernel wrote before updating tail */
+    return io_uring_smp_load_acquire(ctx->cq_tail) - consumed;
+}
+
+struct io_uring_cqe *
+uring_cqe_at(uring_ctx_t *ctx, unsigned idx)
+{
+    /* idx counts from the current cq_head; the mask wraps the sum into the ring. */
+    const unsigned slot = (*ctx->cq_head + idx) & ctx->cq_mask;
+    return ctx->cqes + slot;
+}
+
+void
+uring_cq_advance(uring_ctx_t *ctx, unsigned n)
+{
+    /* Mark n CQEs as consumed with a release store of the new head; n == 0 does nothing. */
+    if (n != 0)
+        io_uring_smp_store_release(ctx->cq_head, *ctx->cq_head + n);
+}
+
+/* --- Batched buffer ring API --------------------------------------------- */
+
+void
+uring_recycle_buf(uring_ctx_t *ctx, int buf_id)
+{
+    buf_ring_add_deferred(ctx, buf_id); /* staged at the shadow tail only; not published until uring_buf_ring_commit() */
+}
+
+void
+uring_buf_ring_commit(uring_ctx_t *ctx)
+{
+    /* Single atomic publish of all deferred buffer additions: release-store the shadow tail into the ring's tail */
+    __atomic_store_n(&ctx->buf_ring->tail, ctx->buf_ring_pending, __ATOMIC_RELEASE);
+}
+
+void
+uring_flush(uring_ctx_t *ctx)
+{
+    /* Trigger deferred completions by entering with GETEVENTS: nothing to submit, min_complete 0, result ignored */
+    sys_io_uring_enter(ctx->ring_fd, 0, 0, IORING_ENTER_GETEVENTS, NULL, 0);
+}
+
+int
+uring_parse_recvmsg_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe,
+                          uring_recv_buf_t *out)
+{
+    /* Decode a successful multishot-recvmsg completion into *out (buffer id, source
+       address, payload pointer and length).  Returns 0 on success; -1 leaves *out untouched. */
+    if (cqe->res < 0)
+        return -1;
+    if ((cqe->flags & IORING_CQE_F_BUFFER) == 0) {
+        mylog(log_debug, "io_uring: recvmsg CQE missing BUFFER flag\n");
+        return -1;
+    }
+    const int id = (int)(cqe->flags >> IORING_CQE_BUFFER_SHIFT);
+    if (id < 0 || ctx->buf_count <= id)
+        return -1;
+    /* The buffer was registered at pool + id*buf_size + HEADROOM, so the kernel's
+       io_uring_recvmsg_out header starts there. */
+    char *base = ctx->buf_pool + (long)id * ctx->buf_size + URING_RECV_HEADROOM;
+    const struct io_uring_recvmsg_out *meta = (const struct io_uring_recvmsg_out *)base;
+    /* Kernel reserves msg_namelen bytes (from the template) for the name area, not
+       meta->namelen (actual), so the payload offset uses the template sizes. */
+    const int prefix = (int)(sizeof(*meta) + ctx->recvmsg_hdr.msg_namelen + ctx->recvmsg_hdr.msg_controllen);
+    const int room = ctx->buf_size - URING_RECV_HEADROOM - prefix;
+    const int reported = (int)meta->payloadlen;
+
+    out->buf_id = id;
+    out->addr_len = (socklen_t)(meta->namelen < sizeof(out->addr) ? meta->namelen : sizeof(out->addr));
+    memcpy(&out->addr, base + sizeof(*meta), out->addr_len);
+    out->data = base + prefix;
+    out->len = reported > room ? room : reported; /* never report more than the buffer can hold */
+    return 0;
+}
+
+int
+uring_parse_recv_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe,
+                      uring_recv_buf_t *out)
+{
+    /* Decode a successful multishot-recv completion into *out.  No source address is
+       reported (addr_len is set to 0).  Returns 0 on success; -1 leaves *out untouched. */
+    if (cqe->res < 0)
+        return -1;
+    if ((cqe->flags & IORING_CQE_F_BUFFER) == 0) {
+        mylog(log_debug, "io_uring: recv CQE missing BUFFER flag\n");
+        return -1;
+    }
+    const int id = (int)(cqe->flags >> IORING_CQE_BUFFER_SHIFT);
+    if (id < 0 || ctx->buf_count <= id)
+        return -1;
+
+    /* Kernel writes at registered addr = pool + id*buf_size + HEADROOM */
+    out->addr_len = 0;
+    out->len = cqe->res;
+    out->data = ctx->buf_pool + (long)id * ctx->buf_size + URING_RECV_HEADROOM;
+    out->buf_id = id;
+    return 0;
+}
+
+#endif /* __linux__ */
diff --git a/io_uring_recv.h b/io_uring_recv.h
new file mode 100644
index 0000000..4ed24a3
--- /dev/null
+++ b/io_uring_recv.h
@@ -0,0 +1,150 @@
+#ifndef IO_URING_RECV_H_
+#define IO_URING_RECV_H_
+
+#include "common.h"
+
+#ifdef __linux__
+
+#include <stdint.h>
+#include <sys/socket.h>
+
+/* --- Kernel constant fallbacks (for older headers) ----------------------- */
+
+#include <linux/io_uring.h>
+#include <sys/syscall.h>
+
+/*
+ * Macro fallbacks — these are #define'd in kernel headers (not enums),
+ * so #ifndef works reliably.  Struct/enum fallbacks are NOT provided;
+ * compilation requires kernel headers 6.0+ (Ubuntu 22.04 HWE or 24.04).
+ * Runtime probe handles older kernels gracefully.
+ */
+#ifndef IORING_RECV_MULTISHOT
+#define IORING_RECV_MULTISHOT (1U << 1)
+#endif
+#ifndef IORING_CQE_F_MORE
+#define IORING_CQE_F_MORE (1U << 1)
+#endif
+#ifndef IORING_CQE_F_BUFFER
+#define IORING_CQE_F_BUFFER (1U << 0)
+#endif
+#ifndef IORING_CQE_BUFFER_SHIFT
+#define IORING_CQE_BUFFER_SHIFT 16
+#endif
+
+/* --- Public API ---------------------------------------------------------- */
+
+struct uring_recv_buf_t {       /* one parsed receive completion, filled in by uring_parse_*_cqe() */
+    char *data;                 /* payload start; points into the context's buf_pool, not a copy */
+    int len;                    /* payload length in bytes */
+    struct sockaddr_storage addr; /* name bytes copied out of the recvmsg header; not written by uring_parse_recv_cqe */
+    socklen_t addr_len;         /* valid bytes in addr; uring_parse_recv_cqe sets it to 0 */
+    int buf_id;                 /* provided-buffer index taken from the CQE flags */
+};
+
+struct uring_ctx_t {    /* state for one io_uring receive context: ring mappings plus provided buffers */
+    int ring_fd;        /* NOTE(review): presumably the io_uring descriptor — confirm in uring_init */
+    int available;      /* 0 = io_uring unusable; presumably nonzero after a successful uring_init — confirm */
+
+    /* mmap'd ring pointers, each paired with its size (NOTE(review): presumably the mapping length — confirm) */
+    void *sq_ring_ptr;
+    size_t sq_ring_sz;
+    void *cq_ring_ptr;
+    size_t cq_ring_sz;
+    struct io_uring_sqe *sqes;
+    size_t sqes_sz;
+
+    /* SQ ring fields: pointers to head/tail/array words plus cached mask and entry count */
+    unsigned *sq_head;
+    unsigned *sq_tail;
+    unsigned *sq_array;
+    unsigned sq_mask;
+    unsigned sq_entries;
+
+    /* CQ ring fields: pointers to head/tail words, cached mask and entry count, CQE array */
+    unsigned *cq_head;
+    unsigned *cq_tail;
+    unsigned cq_mask;
+    unsigned cq_entries;
+    struct io_uring_cqe *cqes;
+
+    /* Provided buffer ring */
+    struct io_uring_buf_ring *buf_ring;
+    unsigned short buf_ring_pending; /* shadow tail for deferred recycling */
+    char *buf_pool;     /* backing storage; buffer id i starts at buf_pool + i*buf_size */
+    int buf_count;      /* number of buffers; the CQE parsers accept ids in [0, buf_count) */
+    int buf_size;       /* size per buffer including header room */
+    int bgid;           /* NOTE(review): presumably the buffer group id — confirm in uring_init */
+
+    /* msghdr template for multishot recvmsg; its msg_namelen/msg_controllen fix the payload offset */
+    struct msghdr recvmsg_hdr;
+    struct sockaddr_storage recvmsg_name; /* NOTE(review): presumably the template's name storage — confirm */
+};
+
+/* user_data layout: tag type in the top byte, payload in the low 56 bits */
+static inline uint64_t uring_tag(uint8_t type, uint64_t payload) {
+    return (payload & ((1ULL << 56) - 1)) | ((uint64_t)type << 56);
+}
+static inline uint8_t uring_tag_type(uint64_t user_data) {
+    return (uint8_t)((user_data >> 56) & 0xFF);
+}
+static inline uint64_t uring_tag_payload(uint64_t user_data) {
+    return user_data & ((1ULL << 56) - 1);
+}
+
+/* Tag types */
+#define URING_TAG_CLIENT_LOCAL   0x01
+#define URING_TAG_CLIENT_REMOTE  0x02
+#define URING_TAG_SERVER_LOCAL   0x03
+#define URING_TAG_SERVER_REMOTE  0x04
+
+/* Headroom reserved before each provided buffer for in-place conv header */
+#define URING_RECV_HEADROOM  4  /* sizeof(u32_t) */
+
+int  uring_init(uring_ctx_t *ctx, int queue_depth, int buf_count, int buf_size);
+void uring_destroy(uring_ctx_t *ctx);
+
+int  uring_add_multishot_recvmsg(uring_ctx_t *ctx, int fd, uint64_t user_data);
+int  uring_add_multishot_recv(uring_ctx_t *ctx, int fd, uint64_t user_data);
+int  uring_cancel(uring_ctx_t *ctx, uint64_t user_data);
+int  uring_submit(uring_ctx_t *ctx);
+
+/* Batched CQ drain */
+unsigned uring_cq_ready(uring_ctx_t *ctx);
+struct io_uring_cqe *uring_cqe_at(uring_ctx_t *ctx, unsigned idx);
+void uring_cq_advance(uring_ctx_t *ctx, unsigned n);
+
+int  uring_submit_and_flush(uring_ctx_t *ctx);
+void uring_flush(uring_ctx_t *ctx);
+
+int  uring_parse_recvmsg_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe,
+                              uring_recv_buf_t *out);
+int  uring_parse_recv_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe,
+                           uring_recv_buf_t *out);
+void uring_recycle_buf(uring_ctx_t *ctx, int buf_id);
+void uring_buf_ring_commit(uring_ctx_t *ctx);
+
+/* Global pointer — set in tunnel event loop, used by connection.cpp cleanup */
+extern uring_ctx_t *g_uring_ctx;
+
+#else /* !__linux__ */
+
+/* Stubs for non-Linux — always unavailable: uring_init fails and uring_destroy does nothing */
+struct uring_recv_buf_t { char *data; int len; int buf_id; };   /* reduced form: no addr/addr_len members */
+struct uring_ctx_t { int available; };                          /* only the availability flag remains */
+static inline int uring_init(uring_ctx_t *ctx, int, int, int) { ctx->available = 0; return -1; }   /* clears the flag, reports failure */
+static inline void uring_destroy(uring_ctx_t *) {}              /* nothing to release */
+
+/* Tag encoder kept so non-Linux builds still compile: type in the top byte, payload in the low 56 bits */
+static inline uint64_t uring_tag(uint8_t type, uint64_t payload) {
+    return (payload & ((1ULL << 56) - 1)) | ((uint64_t)type << 56);
+}
+#define URING_TAG_CLIENT_LOCAL   0x01
+#define URING_TAG_CLIENT_REMOTE  0x02
+#define URING_TAG_SERVER_LOCAL   0x03
+#define URING_TAG_SERVER_REMOTE  0x04
+
+extern uring_ctx_t *g_uring_ctx;
+
+#endif /* __linux__ */
+#endif /* IO_URING_RECV_H_ */
diff --git a/lib/fec.cpp b/lib/fec.cpp
index 982a7f1..7b559f4 100644
--- a/lib/fec.cpp
+++ b/lib/fec.cpp
@@ -210,6 +210,27 @@ init_mul_table()
     for (j=0; j< GF_SIZE+1; j++)
 	    gf_mul_table[0][j] = gf_mul_table[j][0] = 0;
 }
+
+/*
+ * SIMD nibble lookup tables for GF(2^8) multiply-by-constant.
+ * For each constant c, lo_table[c][i] = c*i and hi_table[c][i] = c*(i<<4).
+ * This enables PSHUFB/TBL to process 16 bytes per instruction pair.
+ */
+static gf gf_lo_table[GF_SIZE + 1][16] __attribute__((aligned(16)));
+static gf gf_hi_table[GF_SIZE + 1][16] __attribute__((aligned(16)));
+
+static void
+init_simd_tables()
+{
+    /* Row c holds c times each low nibble value (lo) and c times each nibble shifted to the high half (hi). */
+    for (int c = 0; c < GF_SIZE + 1; c++) {
+	for (int nib = 0; nib < 16; nib++) {
+	    gf_lo_table[c][nib] = gf_mul_table[c][nib];
+	    gf_hi_table[c][nib] = gf_mul_table[c][nib * 16];
+	}
+    }
+}
+
 #else	/* GF_BITS > 8 */
 static inline gf
 gf_mul(x,y)
@@ -326,27 +347,308 @@ generate_gf(void)
 
 /*
  * addmul() computes dst[] = dst[] + c * src[]
- * This is used often, so better optimize it! Currently the loop is
- * unrolled 16 times, a good value for 486 and pentium-class machines.
- * The case c=0 is also optimized, whereas c=1 is not. These
- * calls are unfrequent in my typical apps so I did not bother.
- * 
- * Note that gcc on
+ *
+ * SIMD paths use nibble decomposition: c*x = lo_table[x & 0x0F] ^ hi_table[x >> 4]
+ * where each table has 16 entries fitting in one 128-bit SIMD register.
+ * PSHUFB (x86 SSSE3) / TBL (ARM NEON) performs 16 parallel lookups.
  */
 #define addmul(dst, src, c, sz) \
     if (c != 0) addmul1(dst, src, c, sz)
 
+#if defined(__x86_64__)
+#include <immintrin.h>
+#include <cpuid.h>
+
+static int cpu_has_avx2(void)
+{
+    /* 1 only if the OS saves SSE+AVX state and CPUID reports AVX2; otherwise 0. */
+    const unsigned int osxsave_bit = 1u << 27;	/* leaf 1, ECX: OSXSAVE — OS supports XSAVE */
+    const unsigned int sse_avx_state = 0x6;	/* XCR0 bits 1-2 — OS saves SSE+AVX state */
+    const unsigned int avx2_bit = 1u << 5;	/* AVX2: leaf 7, sub-leaf 0, EBX bit 5 */
+    unsigned int a, b, c, d, xcr0;
+
+    if (!__get_cpuid(1, &a, &b, &c, &d))
+	return 0;
+    if ((c & osxsave_bit) == 0)
+	return 0;
+
+    __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "edx");
+    if ((xcr0 & sse_avx_state) != sse_avx_state)
+	return 0;
+
+    if (!__get_cpuid_count(7, 0, &a, &b, &c, &d))
+	return 0;
+    return (b & avx2_bit) ? 1 : 0;
+}
+
+/* Returns 1 when CPU and OS both support AVX-512F and AVX-512BW (addmul1_avx512 uses both), else 0. */
+static int cpu_has_avx512bw(void)
+{
+    unsigned int eax, ebx, ecx, edx;
+    /* OSXSAVE (leaf 1, ECX bit 27) — OS supports XSAVE */
+    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
+	return 0;
+    if (!(ecx & (1u << 27)))
+	return 0;
+
+    /* XCR0 bits 1,2 (SSE+AVX) + 5,6,7 (opmask, ZMM_Hi256, Hi16_ZMM) */
+    unsigned int xcr0;
+    __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "edx");
+    if ((xcr0 & 0xE6) != 0xE6)
+	return 0;
+
+    /* Leaf 7, sub-leaf 0: EBX bit 16 = AVX-512F (foundation), EBX bit 30 = AVX-512BW; require both */
+    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
+	return 0;
+    return ((ebx >> 16) & 1) && ((ebx >> 30) & 1);
+}
+
+__attribute__((target("ssse3")))	/* SSSE3 (PSHUFB) addmul: dst[i] ^= c * src[i] in GF(2^8) for i in [0, sz) */
+static void
+addmul1_ssse3(gf *dst, gf *src, gf c, int sz)
+{
+    __m128i tbl_lo = _mm_load_si128((const __m128i *)gf_lo_table[c]);	/* products of c with each low nibble */
+    __m128i tbl_hi = _mm_load_si128((const __m128i *)gf_hi_table[c]);	/* products of c with each nibble << 4 */
+    __m128i mask   = _mm_set1_epi8(0x0F);	/* keeps one nibble per byte, so shuffle indices stay in 0..15 */
+
+    int i = 0;
+    /* 2x unrolled: process 32 bytes per iteration for better ILP */
+    for (; i + 32 <= sz; i += 32) {
+	__m128i x1 = _mm_loadu_si128((const __m128i *)(src + i));
+	__m128i x2 = _mm_loadu_si128((const __m128i *)(src + i + 16));
+	__m128i lo1 = _mm_shuffle_epi8(tbl_lo, _mm_and_si128(x1, mask));
+	__m128i hi1 = _mm_shuffle_epi8(tbl_hi, _mm_and_si128(_mm_srli_epi64(x1, 4), mask));	/* 64-bit shift leaks bits across bytes; mask drops them */
+	__m128i lo2 = _mm_shuffle_epi8(tbl_lo, _mm_and_si128(x2, mask));
+	__m128i hi2 = _mm_shuffle_epi8(tbl_hi, _mm_and_si128(_mm_srli_epi64(x2, 4), mask));
+	__m128i d1 = _mm_loadu_si128((const __m128i *)(dst + i));
+	__m128i d2 = _mm_loadu_si128((const __m128i *)(dst + i + 16));
+	_mm_storeu_si128((__m128i *)(dst + i),
+		_mm_xor_si128(d1, _mm_xor_si128(lo1, hi1)));	/* dst ^= lo ^ hi, i.e. dst ^= c * src */
+	_mm_storeu_si128((__m128i *)(dst + i + 16),
+		_mm_xor_si128(d2, _mm_xor_si128(lo2, hi2)));
+    }
+    /* 16-byte tail: remaining whole 16-byte chunks, same computation without unrolling */
+    for (; i + 16 <= sz; i += 16) {
+	__m128i x = _mm_loadu_si128((const __m128i *)(src + i));
+	__m128i lo = _mm_shuffle_epi8(tbl_lo, _mm_and_si128(x, mask));
+	__m128i hi = _mm_shuffle_epi8(tbl_hi,
+		_mm_and_si128(_mm_srli_epi64(x, 4), mask));
+	__m128i d = _mm_loadu_si128((const __m128i *)(dst + i));
+	_mm_storeu_si128((__m128i *)(dst + i),
+		_mm_xor_si128(d, _mm_xor_si128(lo, hi)));
+    }
+
+    /* scalar tail: fewer than 16 bytes remain; uses the same GF_* helpers as the scalar addmul1 path */
+    USE_GF_MULC ;
+    GF_MULC0(c) ;
+    for (; i < sz; i++)
+	GF_ADDMULC(dst[i], src[i]);
+}
+
+__attribute__((target("avx2")))	/* AVX2 (VPSHUFB) addmul: dst[i] ^= c * src[i] in GF(2^8) for i in [0, sz) */
+static void
+addmul1_avx2(gf *dst, gf *src, gf c, int sz)
+{
+    __m128i tbl128_lo = _mm_load_si128((const __m128i *)gf_lo_table[c]);	/* 16-entry tables, also reused by the SSE tail */
+    __m128i tbl128_hi = _mm_load_si128((const __m128i *)gf_hi_table[c]);
+    __m256i tbl_lo = _mm256_broadcastsi128_si256(tbl128_lo);	/* same table in both 128-bit lanes: the 256-bit shuffle works per lane */
+    __m256i tbl_hi = _mm256_broadcastsi128_si256(tbl128_hi);
+    __m256i mask   = _mm256_set1_epi8(0x0F);
+
+    int i = 0;
+    /* 2x unrolled: process 64 bytes per iteration for better ILP */
+    for (; i + 64 <= sz; i += 64) {
+	__m256i x1  = _mm256_loadu_si256((const __m256i *)(src + i));
+	__m256i x2  = _mm256_loadu_si256((const __m256i *)(src + i + 32));
+	__m256i lo1 = _mm256_shuffle_epi8(tbl_lo, _mm256_and_si256(x1, mask));
+	__m256i hi1 = _mm256_shuffle_epi8(tbl_hi, _mm256_and_si256(_mm256_srli_epi64(x1, 4), mask));	/* shift is 64-bit wide; mask keeps the high nibble only */
+	__m256i lo2 = _mm256_shuffle_epi8(tbl_lo, _mm256_and_si256(x2, mask));
+	__m256i hi2 = _mm256_shuffle_epi8(tbl_hi, _mm256_and_si256(_mm256_srli_epi64(x2, 4), mask));
+	__m256i d1  = _mm256_loadu_si256((const __m256i *)(dst + i));
+	__m256i d2  = _mm256_loadu_si256((const __m256i *)(dst + i + 32));
+	_mm256_storeu_si256((__m256i *)(dst + i),
+		_mm256_xor_si256(d1, _mm256_xor_si256(lo1, hi1)));	/* dst ^= lo ^ hi, i.e. dst ^= c * src */
+	_mm256_storeu_si256((__m256i *)(dst + i + 32),
+		_mm256_xor_si256(d2, _mm256_xor_si256(lo2, hi2)));
+    }
+    /* 32-byte tail: remaining whole 32-byte chunks (at most one after the 64-byte loop) */
+    for (; i + 32 <= sz; i += 32) {
+	__m256i x  = _mm256_loadu_si256((const __m256i *)(src + i));
+	__m256i lo = _mm256_shuffle_epi8(tbl_lo, _mm256_and_si256(x, mask));
+	__m256i hi = _mm256_shuffle_epi8(tbl_hi,
+		_mm256_and_si256(_mm256_srli_epi64(x, 4), mask));
+	__m256i d  = _mm256_loadu_si256((const __m256i *)(dst + i));
+	_mm256_storeu_si256((__m256i *)(dst + i),
+		_mm256_xor_si256(d, _mm256_xor_si256(lo, hi)));
+    }
+
+    /* SSE tail: at most one 16-byte chunk */
+    if (i + 16 <= sz) {
+	__m128i mx = _mm_set1_epi8(0x0F);
+	__m128i x  = _mm_loadu_si128((const __m128i *)(src + i));
+	__m128i lo = _mm_shuffle_epi8(tbl128_lo, _mm_and_si128(x, mx));
+	__m128i hi = _mm_shuffle_epi8(tbl128_hi,
+		_mm_and_si128(_mm_srli_epi64(x, 4), mx));
+	__m128i d  = _mm_loadu_si128((const __m128i *)(dst + i));
+	_mm_storeu_si128((__m128i *)(dst + i),
+		_mm_xor_si128(d, _mm_xor_si128(lo, hi)));
+	i += 16;
+    }
+
+    /* scalar tail: fewer than 16 bytes remain; uses the same GF_* helpers as the scalar addmul1 path */
+    USE_GF_MULC ;
+    GF_MULC0(c) ;
+    for (; i < sz; i++)
+	GF_ADDMULC(dst[i], src[i]);
+}
+
+__attribute__((target("avx512bw")))	/* AVX-512 addmul: dst[i] ^= c * src[i] in GF(2^8) for i in [0, sz) */
+static void
+addmul1_avx512(gf *dst, gf *src, gf c, int sz)
+{
+    __m512i tbl_lo = _mm512_broadcast_i32x4(	/* 16-entry table repeated in all four 128-bit lanes */
+	_mm_load_si128((const __m128i *)gf_lo_table[c]));
+    __m512i tbl_hi = _mm512_broadcast_i32x4(
+	_mm_load_si128((const __m128i *)gf_hi_table[c]));
+    __m512i mask   = _mm512_set1_epi8(0x0F);
+
+    int i = 0;
+    /* 2x unrolled: process 128 bytes per iteration for better ILP */
+    for (; i + 128 <= sz; i += 128) {
+	__m512i x1  = _mm512_loadu_si512(src + i);
+	__m512i x2  = _mm512_loadu_si512(src + i + 64);
+	__m512i lo1 = _mm512_shuffle_epi8(tbl_lo, _mm512_and_si512(x1, mask));
+	__m512i hi1 = _mm512_shuffle_epi8(tbl_hi,
+		_mm512_and_si512(_mm512_srli_epi64(x1, 4), mask));	/* shift is 64-bit wide; mask keeps the high nibble only */
+	__m512i lo2 = _mm512_shuffle_epi8(tbl_lo, _mm512_and_si512(x2, mask));
+	__m512i hi2 = _mm512_shuffle_epi8(tbl_hi,
+		_mm512_and_si512(_mm512_srli_epi64(x2, 4), mask));
+	__m512i d1  = _mm512_loadu_si512(dst + i);
+	__m512i d2  = _mm512_loadu_si512(dst + i + 64);
+	_mm512_storeu_si512(dst + i,
+		_mm512_ternarylogic_epi64(d1, lo1, hi1, 0x96));	/* imm 0x96 = three-input XOR: d ^ lo ^ hi */
+	_mm512_storeu_si512(dst + i + 64,
+		_mm512_ternarylogic_epi64(d2, lo2, hi2, 0x96));
+    }
+    /* 64-byte tail: remaining whole 64-byte chunks (at most one after the 128-byte loop) */
+    for (; i + 64 <= sz; i += 64) {
+	__m512i x  = _mm512_loadu_si512(src + i);
+	__m512i lo = _mm512_shuffle_epi8(tbl_lo, _mm512_and_si512(x, mask));
+	__m512i hi = _mm512_shuffle_epi8(tbl_hi,
+		_mm512_and_si512(_mm512_srli_epi64(x, 4), mask));
+	__m512i d  = _mm512_loadu_si512(dst + i);
+	_mm512_storeu_si512(dst + i,
+		_mm512_ternarylogic_epi64(d, lo, hi, 0x96));
+    }
+
+    /* AVX2 tail: at most one 32-byte chunk */
+    if (i + 32 <= sz) {
+	__m256i tbl256_lo = _mm256_broadcastsi128_si256(	/* tables are reloaded from memory for the narrower width */
+		_mm_load_si128((const __m128i *)gf_lo_table[c]));
+	__m256i tbl256_hi = _mm256_broadcastsi128_si256(
+		_mm_load_si128((const __m128i *)gf_hi_table[c]));
+	__m256i m256 = _mm256_set1_epi8(0x0F);
+	__m256i x  = _mm256_loadu_si256((const __m256i *)(src + i));
+	__m256i lo = _mm256_shuffle_epi8(tbl256_lo, _mm256_and_si256(x, m256));
+	__m256i hi = _mm256_shuffle_epi8(tbl256_hi,
+		_mm256_and_si256(_mm256_srli_epi64(x, 4), m256));
+	__m256i d  = _mm256_loadu_si256((const __m256i *)(dst + i));
+	_mm256_storeu_si256((__m256i *)(dst + i),
+		_mm256_xor_si256(d, _mm256_xor_si256(lo, hi)));
+	i += 32;
+    }
+
+    /* SSE tail: at most one 16-byte chunk */
+    if (i + 16 <= sz) {
+	__m128i mx = _mm_set1_epi8(0x0F);
+	__m128i x  = _mm_loadu_si128((const __m128i *)(src + i));
+	__m128i lo = _mm_shuffle_epi8(
+		_mm_load_si128((const __m128i *)gf_lo_table[c]),
+		_mm_and_si128(x, mx));
+	__m128i hi = _mm_shuffle_epi8(
+		_mm_load_si128((const __m128i *)gf_hi_table[c]),
+		_mm_and_si128(_mm_srli_epi64(x, 4), mx));
+	__m128i d  = _mm_loadu_si128((const __m128i *)(dst + i));
+	_mm_storeu_si128((__m128i *)(dst + i),
+		_mm_xor_si128(d, _mm_xor_si128(lo, hi)));
+	i += 16;
+    }
+
+    /* scalar tail: fewer than 16 bytes remain; uses the same GF_* helpers as the scalar addmul1 path */
+    USE_GF_MULC ;
+    GF_MULC0(c) ;
+    for (; i < sz; i++)
+	GF_ADDMULC(dst[i], src[i]);
+}
+
+static void (*addmul1_x86_fn)(gf *, gf *, gf, int) = addmul1_ssse3;	/* x86-64 dispatch target; init_fec() may switch it to the AVX2 or AVX-512 variant. NOTE(review): no CPUID check for SSSE3 is visible — confirm every supported x86-64 CPU has it */
+#endif /* __x86_64__ */
+
+#if defined(__aarch64__)
+#include <arm_neon.h>
+
+static void	/* NEON (TBL) addmul: dst[i] ^= c * src[i] in GF(2^8) for i in [0, sz) */
+addmul1_neon(gf *dst, gf *src, gf c, int sz)
+{
+    uint8x16_t tbl_lo = vld1q_u8(gf_lo_table[c]);	/* products of c with each low nibble */
+    uint8x16_t tbl_hi = vld1q_u8(gf_hi_table[c]);	/* products of c with each nibble << 4 */
+    uint8x16_t mask   = vdupq_n_u8(0x0F);
+
+    int i = 0;
+    for (; i + 32 <= sz; i += 32) {	/* two 16-byte vectors per iteration */
+	uint8x16_t x1 = vld1q_u8(src + i);
+	uint8x16_t x2 = vld1q_u8(src + i + 16);
+	uint8x16_t lo1 = vqtbl1q_u8(tbl_lo, vandq_u8(x1, mask));
+	uint8x16_t hi1 = vqtbl1q_u8(tbl_hi, vshrq_n_u8(x1, 4));	/* per-byte shift already yields 0..15, so no mask here */
+	uint8x16_t lo2 = vqtbl1q_u8(tbl_lo, vandq_u8(x2, mask));
+	uint8x16_t hi2 = vqtbl1q_u8(tbl_hi, vshrq_n_u8(x2, 4));
+	uint8x16_t d1 = vld1q_u8(dst + i);
+	uint8x16_t d2 = vld1q_u8(dst + i + 16);
+	vst1q_u8(dst + i,      veorq_u8(d1, veorq_u8(lo1, hi1)));	/* dst ^= lo ^ hi, i.e. dst ^= c * src */
+	vst1q_u8(dst + i + 16, veorq_u8(d2, veorq_u8(lo2, hi2)));
+    }
+    for (; i + 16 <= sz; i += 16) {	/* at most one remaining 16-byte vector */
+	uint8x16_t x = vld1q_u8(src + i);
+	uint8x16_t lo = vqtbl1q_u8(tbl_lo, vandq_u8(x, mask));
+	uint8x16_t hi = vqtbl1q_u8(tbl_hi, vshrq_n_u8(x, 4));
+	uint8x16_t d = vld1q_u8(dst + i);
+	vst1q_u8(dst + i, veorq_u8(d, veorq_u8(lo, hi)));
+    }
+
+    /* scalar tail: fewer than 16 bytes remain; uses the same GF_* helpers as the scalar addmul1 path */
+    USE_GF_MULC ;
+    GF_MULC0(c) ;
+    for (; i < sz; i++)
+	GF_ADDMULC(dst[i], src[i]);
+}
+#endif /* __aarch64__ */
+
 #define UNROLL 16 /* 1, 4, 8, 16 */
 static void
 addmul1(gf *dst1, gf *src1, gf c, int sz)
 {
+#if defined(__x86_64__)
+    addmul1_x86_fn(dst1, src1, c, sz);
+#elif defined(__aarch64__)
+    addmul1_neon(dst1, src1, c, sz);
+#else
+    /*
+     * Scalar fallback for MIPS, i486, ARMv7, etc.
+     *
+     * NOT auto-vectorizable: the 256-entry table lookup (gf_mulc_table[c][src[i]])
+     * is a data-dependent gather. The nibble decomposition that makes PSHUFB/TBL
+     * work requires GF(2^8) algebraic insight no compiler performs. Pragmas like
+     * omp simd, __restrict__, and -ftree-vectorize don't help — they grant
+     * permission to vectorize but can't transform the lookup. Deliberate choice.
+     */
+    if (sz <= 0) return;
     USE_GF_MULC ;
     gf *dst = dst1, *src = src1 ;
     gf *lim = &dst[sz - UNROLL + 1] ;
 
     GF_MULC0(c) ;
 
-#if (UNROLL > 1) /* unrolling by 8/16 is quite effective on the pentium */
+#if (UNROLL > 1)
     for (; dst < lim ; dst += UNROLL, src += UNROLL ) {
 	GF_ADDMULC( dst[0] , src[0] );
 	GF_ADDMULC( dst[1] , src[1] );
@@ -371,8 +673,9 @@ addmul1(gf *dst1, gf *src1, gf c, int sz)
     }
 #endif
     lim += UNROLL - 1 ;
-    for (; dst < lim; dst++, src++ )		/* final components */
+    for (; dst < lim; dst++, src++ )
 	GF_ADDMULC( *dst , *src );
+#endif /* architecture dispatch */
 }
 
 /*
@@ -429,13 +732,12 @@ invert_mat(gf *src, int k)
     int irow, icol, row, col, i, ix ;
 
     int error = 1 ;
-    int *indxc = (int*)my_malloc(k*sizeof(int), "indxc");
-    int *indxr = (int*)my_malloc(k*sizeof(int), "indxr");
-    int *ipiv = (int*)my_malloc(k*sizeof(int), "ipiv");
-    gf *id_row = NEW_GF_MATRIX(1, k);
-    gf *temp_row = NEW_GF_MATRIX(1, k);
+    int indxc[k];
+    int indxr[k];
+    int ipiv[k];
+    gf id_row[k];
 
-    bzero(id_row, k*sizeof(gf));
+    memset(id_row, 0, (unsigned)k * sizeof(gf));
     DEB( pivloops=0; pivswaps=0 ; /* diagnostic */ )
     /*
      * ipiv marks elements already used as pivots.
@@ -540,11 +842,6 @@ invert_mat(gf *src, int k)
     }
     error = 0 ;
 fail:
-    free(indxc);
-    free(indxr);
-    free(ipiv);
-    free(id_row);
-    free(temp_row);
     return error ;
 }
 
@@ -628,6 +925,15 @@ init_fec()
     init_mul_table();
     TOCK(ticks[0]);
     DDB(fprintf(stderr, "init_mul_table took %ldus\n", ticks[0]);)
+#if (GF_BITS <= 8)
+    init_simd_tables();
+#endif
+#if defined(__x86_64__)
+    if (cpu_has_avx512bw())
+	addmul1_x86_fn = addmul1_avx512;
+    else if (cpu_has_avx2())
+	addmul1_x86_fn = addmul1_avx2;
+#endif
     fec_initialized = 1 ;
 }
 
@@ -643,6 +949,9 @@ struct fec_parms {
     u_long magic ;
     int k, n ;		/* parameters of the code */
     gf *enc_matrix ;
+    gf *dec_matrix ;	/* k*k scratch for build_decode_matrix */
+    gf *dec_buf ;	/* k*dec_buf_sz scratch for fec_decode */
+    int dec_buf_sz ;	/* current sz capacity, 0 = not yet allocated */
 } ;
 
 void
@@ -655,6 +964,8 @@ fec_free(void *p0)
 	return ;
     }
     free(p->enc_matrix);
+    free(p->dec_matrix);
+    free(p->dec_buf);
     free(p);
 }
 
@@ -682,6 +993,9 @@ fec_new(int k, int n)
     retval->k = k ;
     retval->n = n ;
     retval->enc_matrix = NEW_GF_MATRIX(n, k);
+    retval->dec_matrix = NEW_GF_MATRIX(k, k);
+    retval->dec_buf = NULL ;
+    retval->dec_buf_sz = 0 ;
     retval->magic = ( ( FEC_MAGIC ^ k) ^ n) ^ (int)((long)retval->enc_matrix) ;
     tmp_m = NEW_GF_MATRIX(n, k);
     /*
@@ -792,11 +1106,11 @@ shuffle(gf *pkt[], int index[], int k)
  * indexes. The matrix must be already allocated as
  * a vector of k*k elements, in row-major order
  */
-static gf *
-build_decode_matrix(struct fec_parms *code, gf *pkt[], int index[])
+static int
+build_decode_matrix(struct fec_parms *code, gf *pkt[], int index[], gf *matrix)
 {
     int i , k = code->k ;
-    gf *p, *matrix = NEW_GF_MATRIX(k, k);
+    gf *p ;
 
     TICK(ticks[9]);
     for (i = 0, p = matrix ; i < k ; i++, p += k ) {
@@ -807,21 +1121,19 @@ build_decode_matrix(struct fec_parms *code, gf *pkt[], int index[])
 	} else
 #endif
 	if (index[i] < code->n )
-	    bcopy( &(code->enc_matrix[index[i]*k]), p, k*sizeof(gf) ); 
+	    bcopy( &(code->enc_matrix[index[i]*k]), p, k*sizeof(gf) );
 	else {
 	    fprintf(stderr, "decode: invalid index %d (max %d)\n",
 		index[i], code->n - 1 );
-	    free(matrix) ;
-	    return NULL ;
+	    return -1 ;
 	}
     }
     TICK(ticks[9]);
     if (invert_mat(matrix, k)) {
-	free(matrix);
-	matrix = NULL ;
+	return -1 ;
     }
     TOCK(ticks[9]);
-    return matrix ;
+    return 0 ;
 }
 
 /*
@@ -841,29 +1153,32 @@ fec_decode(void *code0, void *pkt0[], int index[], int sz)
 {
 	struct fec_parms * code=(struct fec_parms*)code0;
 	gf **pkt=(gf**)pkt0;
-    gf *m_dec ; 
-    gf **new_pkt ;
     int row, col , k = code->k ;
+    gf *new_pkt[k] ;
 
     if (GF_BITS > 8)
 	sz /= 2 ;
 
     if (shuffle(pkt, index, k))	/* error if true */
 	return 1 ;
-    m_dec = build_decode_matrix(code, pkt, index);
-
-    if (m_dec == NULL)
+    if (build_decode_matrix(code, pkt, index, code->dec_matrix))
 	return 1 ; /* error */
+
+    /* ensure decode scratch buffer is large enough */
+    if (code->dec_buf_sz < sz) {
+	free(code->dec_buf);
+	code->dec_buf = (gf *)my_malloc(k * sz * sizeof(gf), "dec_buf");
+	code->dec_buf_sz = sz ;
+    }
     /*
      * do the actual decoding
      */
-    new_pkt = (gf** )my_malloc (k * sizeof (gf * ), "new pkt pointers" );
     for (row = 0 ; row < k ; row++ ) {
 	if (index[row] >= k) {
-	    new_pkt[row] = (gf*)my_malloc (sz * sizeof (gf), "new pkt buffer" );
+	    new_pkt[row] = code->dec_buf + row * sz ;
 	    bzero(new_pkt[row], sz * sizeof(gf) ) ;
 	    for (col = 0 ; col < k ; col++ )
-		addmul(new_pkt[row], pkt[col], m_dec[row*k + col], sz) ;
+		addmul(new_pkt[row], pkt[col], code->dec_matrix[row*k + col], sz) ;
 	}
     }
     /*
@@ -872,11 +1187,8 @@ fec_decode(void *code0, void *pkt0[], int index[], int sz)
     for (row = 0 ; row < k ; row++ ) {
 	if (index[row] >= k) {
 	    bcopy(new_pkt[row], pkt[row], sz*sizeof(gf));
-	    free(new_pkt[row]);
 	}
     }
-    free(new_pkt);
-    free(m_dec);
 
     return 0;
 }
@@ -915,3 +1227,20 @@ test_gf()
     }
 }
 #endif /* TEST */
+
+#ifdef BENCH_EXPOSE_INTERNALS
+void bench_addmul1(gf *dst, gf *src, gf c, int sz) {	/* externally visible wrapper around the file-static addmul1() */
+    addmul1(dst, src, c, sz);	/* forwards unchanged, so callers exercise the same per-architecture dispatch */
+}
+const char *bench_addmul1_impl() {	/* names the addmul1 back end this build currently dispatches to */
+#if defined(__x86_64__)
+    return addmul1_x86_fn == addmul1_avx512 ? "avx512bw"
+         : addmul1_x86_fn == addmul1_avx2   ? "avx2"
+                                            : "ssse3";
+#elif defined(__aarch64__)
+    return "neon";
+#else
+    return "scalar";
+#endif
+}
+#endif
diff --git a/makefile b/makefile
index 5aa9cab..7520c13 100755
--- a/makefile
+++ b/makefile
@@ -10,12 +10,16 @@ cc_amd64=/toolchains/lede-sdk-17.01.2-x86-64_gcc-5.4.0_musl-1.1.16.Linux-x86_64/
 #cc_bcm2708=/home/wangyu/raspberry/tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian/bin/arm-linux-gnueabihf-g++ 
 
 
-SOURCES0=main.cpp log.cpp common.cpp lib/fec.cpp lib/rs.cpp crc32/Crc32.cpp packet.cpp delay_manager.cpp fd_manager.cpp connection.cpp fec_manager.cpp misc.cpp tunnel_client.cpp tunnel_server.cpp
-SOURCES=${SOURCES0} my_ev.cpp -isystem libev 
+SOURCES0=main.cpp log.cpp common.cpp lib/fec.cpp lib/rs.cpp packet.cpp packet_cook.cpp delay_manager.cpp fd_manager.cpp connection.cpp fec_manager.cpp misc.cpp tunnel_client.cpp tunnel_server.cpp io_uring_recv.cpp xor_spe.S
+SOURCES=${SOURCES0} my_ev.cpp -isystem libev
 NAME=speederv2
 
 
-FLAGS= -std=c++11   -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers ${OPT}
+FLAGS= -std=c++11   -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers -MMD -MP ${OPT}
+
+ifdef SPE
+FLAGS += -DHAVE_PPC_SPE -Wa,-mspe
+endif
 
 TARGETS=amd64 arm mips24kc_be x86  mips24kc_le
 
@@ -110,10 +114,44 @@ release2: ${TARGETS} mingw_cross mingw_cross_wepoll mac_cross
 	cp git_version.h version.txt
 	tar -zcvf ${TAR} ${NAME}.exe ${NAME}_wepoll.exe ${NAME}_mac
 
-clean:	
+clean:
 	rm -f ${TAR}
 	rm -f ${NAME} ${NAME}_cross ${NAME}.exe ${NAME}_wepoll.exe ${NAME}_mac
 	rm -f git_version.h
+	rm -f *.d bench/*.d lib/*.d crc32/*.d
+
+-include $(wildcard *.d bench/*.d lib/*.d crc32/*.d)
 
 git_version:
 	    echo "const char *gitversion = \"$(shell git rev-parse HEAD)\";" > git_version.h
+
+# --- Benchmark and test targets ---
+BENCH_SOURCES=bench/bench_main.cpp bench/bench_fec.cpp bench/bench_crc32.cpp bench/bench_packet.cpp lib/fec.cpp lib/rs.cpp crc32/Crc32.cpp packet_cook.cpp xor_spe.S
+TEST_SOURCES=bench/test_main.cpp bench/test_fec.cpp bench/test_crc32.cpp bench/test_packet.cpp lib/fec.cpp lib/rs.cpp crc32/Crc32.cpp packet_cook.cpp xor_spe.S
+BENCH_FLAGS=-std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers -O2 -DBENCH_EXPOSE_INTERNALS -MMD -MP
+
+ifdef SPE
+BENCH_FLAGS += -DHAVE_PPC_SPE -Wa,-mspe
+endif
+
+bench: git_version
+	${cc_local} -o bench_udpspeeder -I. -Ibench ${BENCH_SOURCES} ${BENCH_FLAGS}
+
+test: git_version
+	${cc_local} -o test_udpspeeder -I. -Ibench ${TEST_SOURCES} ${BENCH_FLAGS}
+	./test_udpspeeder
+
+bench-static: git_version
+	${cc_local} -o bench_udpspeeder_static -I. -Ibench ${BENCH_SOURCES} ${BENCH_FLAGS} -static
+
+test-static: git_version
+	${cc_local} -o test_udpspeeder_static -I. -Ibench ${TEST_SOURCES} ${BENCH_FLAGS} -static
+
+bench-cross: git_version
+	${CC} -o bench_udpspeeder_cross -I. -Ibench ${BENCH_SOURCES} ${BENCH_FLAGS} -static -lgcc_eh
+
+test-cross: git_version
+	${CC} -o test_udpspeeder_cross -I. -Ibench ${TEST_SOURCES} ${BENCH_FLAGS} -static -lgcc_eh
+
+all-cross: git_version
+	${CC} -o ${NAME}_cross -I. ${SOURCES} ${FLAGS} -lrt -static -lgcc_eh -O2
diff --git a/misc.cpp b/misc.cpp
index 5918e57..34da625 100644
--- a/misc.cpp
+++ b/misc.cpp
@@ -13,7 +13,7 @@ int mtu_warn = 1350;
 
 int disable_mtu_warn = 1;
 int disable_fec = 0;
-int disable_checksum = 0;
+/* disable_checksum now lives in cook_ctx (packet.cpp) */
 
 int debug_force_flush_fec = 0;
 
@@ -207,6 +207,25 @@ int from_fec_to_normal(conn_info_t &conn_info, char *data, int len, int &out_n,
     return 0;
 }
 
+int delay_send_batch(int n, my_time_t *delays, const dest_t &dest, char **data_arr, int *len_arr) {
+    if (n <= 0) return 0;
+
+    /* Fast path (one my_send_batch() call): needs more than one packet,
+     * random_drop disabled, and a zero delay on every packet. */
+    bool batchable = (n > 1) && (random_drop == 0);
+    for (int idx = 0; batchable && idx < n; ++idx)
+        batchable = (delays[idx] == 0);
+    if (batchable)
+        return my_send_batch(dest, data_arr, len_arr, n);
+
+    /* Slow path: each packet goes through delay_send() with its own delay,
+     * keeping the per-packet delay/drop handling. */
+    for (int idx = 0; idx < n; ++idx)
+        delay_send(delays[idx], dest, data_arr[idx], len_arr[idx]);
+
+    return 0;
+}
+
 int delay_send(my_time_t delay, const dest_t &dest, char *data, int len) {
     // int rand=random()%100;
     // mylog(log_info,"rand = %d\n",rand);
@@ -625,12 +644,13 @@ void process_arg(int argc, char *argv[]) {
         // opt_key+=opt;
         switch (opt) {
             case 'k':
-                sscanf(optarg, "%s\n", key_string);
-                mylog(log_debug, "key=%s\n", key_string);
-                if (strlen(key_string) == 0) {
+                sscanf(optarg, "%s\n", cook_ctx.key);
+                mylog(log_debug, "key=%s\n", cook_ctx.key);
+                if (strlen(cook_ctx.key) == 0) {
                     mylog(log_fatal, "key len=0??\n");
                     myexit(-1);
                 }
+                cook_ctx_prepare_key(&cook_ctx);
                 break;
             case 'j':
                 if (strchr(optarg, ':') == 0) {
@@ -723,12 +743,12 @@ void process_arg(int argc, char *argv[]) {
                     disable_fec = 1;
                 } else if (strcmp(long_options[option_index].name, "disable-obscure") == 0) {
                     mylog(log_info, "obscure disabled\n");
-                    disable_obscure = 1;
+                    cook_ctx.disable_obscure = 1;
                 } else if (strcmp(long_options[option_index].name, "disable-xor") == 0) {
                     mylog(log_info, "xor disabled\n");
-                    disable_xor = 1;
+                    cook_ctx.disable_xor = 1;
                 } else if (strcmp(long_options[option_index].name, "disable-checksum") == 0) {
-                    disable_checksum = 1;
+                    cook_ctx.disable_checksum = 1;
                     mylog(log_warn, "checksum disabled\n");
                 } else if (strcmp(long_options[option_index].name, "fix-latency") == 0) {
                     mylog(log_info, "fix-latency enabled\n");
diff --git a/misc.h b/misc.h
index 6ed637e..5350ad9 100644
--- a/misc.h
+++ b/misc.h
@@ -20,7 +20,7 @@ extern int mtu_warn;
 
 extern int disable_mtu_warn;
 extern int disable_fec;
-extern int disable_checksum;
+/* disable_checksum now lives in cook_ctx_t (packet_cook.h) */
 
 extern int debug_force_flush_fec;
 
@@ -62,6 +62,7 @@ int from_normal_to_fec(conn_info_t &conn_info, char *data, int len, int &out_n,
 int from_fec_to_normal(conn_info_t &conn_info, char *data, int len, int &out_n, char **&out_arr, int *&out_len, my_time_t *&out_delay);
 
 int delay_send(my_time_t delay, const dest_t &dest, char *data, int len);
+int delay_send_batch(int n, my_time_t *delays, const dest_t &dest, char **data_arr, int *len_arr);
 int print_parameter();
 int handle_command(char *s);
 
diff --git a/packet.cpp b/packet.cpp
index 87519c3..d1affff 100644
--- a/packet.cpp
+++ b/packet.cpp
@@ -9,10 +9,10 @@
 #include "log.h"
 #include "packet.h"
 #include "misc.h"
-#include "crc32/Crc32.h"
+#include "crc32c.h"
+
+cook_ctx_t cook_ctx = { {}, 0, 0, {}, 4, 32, 0, 0, 0 };
 
-int iv_min = 4;
-int iv_max = 32;  //< 256;
 u64_t packet_send_count = 0;
 u64_t dup_packet_send_count = 0;
 u64_t packet_recv_count = 0;
@@ -21,115 +21,8 @@ u64_t dup_packet_recv_count = 0;
 typedef u64_t anti_replay_seq_t;
 int disable_replay_filter = 0;
 
-int disable_obscure = 0;
-int disable_xor = 0;
-
 int random_drop = 0;
 
-char key_string[1000] = "";
-
-// int local_listen_fd=-1;
-
-void encrypt_0(char *input, int &len, char *key) {
-    int i, j;
-    if (key[0] == 0) return;
-    for (i = 0, j = 0; i < len; i++, j++) {
-        if (key[j] == 0) j = 0;
-        input[i] ^= key[j];
-    }
-}
-
-void decrypt_0(char *input, int &len, char *key) {
-    int i, j;
-    if (key[0] == 0) return;
-    for (i = 0, j = 0; i < len; i++, j++) {
-        if (key[j] == 0) j = 0;
-        input[i] ^= key[j];
-    }
-}
-int do_obscure_old(const char *input, int in_len, char *output, int &out_len) {
-    // memcpy(output,input,in_len);
-    //	out_len=in_len;
-    // return 0;
-
-    int i, j, k;
-    if (in_len > 65535 || in_len < 0)
-        return -1;
-    int iv_len = iv_min + rand() % (iv_max - iv_min);
-    get_fake_random_chars(output, iv_len);
-    memcpy(output + iv_len, input, in_len);
-
-    output[iv_len + in_len] = (uint8_t)iv_len;
-
-    output[iv_len + in_len] ^= output[0];
-    output[iv_len + in_len] ^= key_string[0];
-
-    for (i = 0, j = 0, k = 1; i < in_len; i++, j++, k++) {
-        if (j == iv_len) j = 0;
-        if (key_string[k] == 0) k = 0;
-        output[iv_len + i] ^= output[j];
-        output[iv_len + i] ^= key_string[k];
-    }
-
-    out_len = iv_len + in_len + 1;
-    return 0;
-}
-
-int do_obscure(char *data, int &len) {
-    assert(len >= 0);
-    assert(len < buf_len);
-
-    int iv_len = random_between(iv_min, iv_max);
-    get_fake_random_chars(data + len, iv_len);
-    data[iv_len + len] = (uint8_t)iv_len;
-    for (int i = 0, j = 0; i < len; i++, j++) {
-        if (j == iv_len) j = 0;
-        data[i] ^= data[len + j];
-    }
-
-    len = len + iv_len + 1;
-    return 0;
-}
-
-int de_obscure(char *data, int &len) {
-    if (len < 1) return -1;
-    int iv_len = int((uint8_t)data[len - 1]);
-
-    if (len < 1 + iv_len) return -1;
-
-    len = len - 1 - iv_len;
-    for (int i = 0, j = 0; i < len; i++, j++) {
-        if (j == iv_len) j = 0;
-        data[i] ^= data[len + j];
-    }
-
-    return 0;
-}
-int de_obscure_old(const char *input, int in_len, char *output, int &out_len) {
-    // memcpy(output,input,in_len);
-    // out_len=in_len;
-    // return 0;
-
-    int i, j, k;
-    if (in_len > 65535 || in_len < 0) {
-        mylog(log_debug, "in_len > 65535||in_len<0 ,  %d", in_len);
-        return -1;
-    }
-    int iv_len = int((uint8_t)(input[in_len - 1] ^ input[0] ^ key_string[0]));
-    out_len = in_len - 1 - iv_len;
-    if (out_len < 0) {
-        mylog(log_debug, "%d %d\n", in_len, out_len);
-        return -1;
-    }
-    for (i = 0, j = 0, k = 1; i < in_len; i++, j++, k++) {
-        if (j == iv_len) j = 0;
-        if (key_string[k] == 0) k = 0;
-        output[i] = input[iv_len + i] ^ input[j] ^ key_string[k];
-    }
-    dup_packet_recv_count++;
-    return 0;
-}
-
 /*
 int sendto_fd_ip_port (int fd,u32_t ip,int port,char * buf, int len,int flags)
 {
@@ -163,9 +56,91 @@ int send_fd(int fd, char *buf, int len, int flags) {
     return send(fd, buf, len, flags);
 }
 
+/* Send count datagrams to dest, as one sendmmsg() batch on Linux; returns packets
+ * sent or -1. Cooking happens only after dest is resolved, so the my_send()
+ * fallback (which cooks by itself) never receives already-cooked packets. */
+int my_send_batch(const dest_t &dest, char **data_arr, int *len_arr, int count) {
+    if (count <= 0) return 0;
+    if (count == 1) return my_send(dest, data_arr[0], len_arr[0]);
+
+    /* Resolve fd and optional destination address.
+     * Copy address out of const dest (same as sendto_fd_addr taking addr by value). */
+    int fd = -1;
+    address_t addr_copy;
+    struct sockaddr *addr_ptr = NULL;
+    socklen_t addr_len = 0;
+    bool batchable = count <= max_fec_packet_num; /* capacity of msgs[]/iovecs[] */
+
+    switch (dest.type) {
+        case type_fd_addr:
+            fd = dest.inner.fd_addr.fd;
+            addr_copy = dest.inner.fd_addr.addr;
+            addr_ptr = (struct sockaddr *)&addr_copy.inner;
+            addr_len = addr_copy.get_len();
+            break;
+        case type_fd64_addr:
+            if (!fd_manager.exist(dest.inner.fd64)) return -1;
+            fd = fd_manager.to_fd(dest.inner.fd64);
+            addr_copy = dest.inner.fd64_addr.addr;
+            addr_ptr = (struct sockaddr *)&addr_copy.inner;
+            addr_len = addr_copy.get_len();
+            break;
+        case type_fd64:
+            if (!fd_manager.exist(dest.inner.fd64)) return -1;
+            fd = fd_manager.to_fd(dest.inner.fd64);
+            break;
+        case type_fd:
+            fd = dest.inner.fd;
+            break;
+        default:
+            batchable = false; /* no plain fd to batch on for this dest type */
+            break;
+    }
+    if (!batchable) { /* per-packet path; my_send() cooks each packet itself */
+        for (int i = 0; i < count; i++) my_send(dest, data_arr[i], len_arr[i]);
+        return count;
+    }
+
+    /* Cook all packets */
+    if (dest.cook) {
+        for (int i = 0; i < count; i++)
+            do_cook(&cook_ctx, data_arr[i], len_arr[i]);
+    }
+
+#ifdef __linux__
+    struct mmsghdr msgs[max_fec_packet_num];
+    struct iovec iovecs[max_fec_packet_num];
+    memset(msgs, 0, sizeof(msgs[0]) * count); /* control, flags, msg_len start at zero */
+    for (int i = 0; i < count; i++) {
+        iovecs[i].iov_base = data_arr[i];
+        iovecs[i].iov_len = len_arr[i];
+        msgs[i].msg_hdr.msg_iov = &iovecs[i];
+        msgs[i].msg_hdr.msg_iovlen = 1;
+        msgs[i].msg_hdr.msg_name = addr_ptr;
+        msgs[i].msg_hdr.msg_namelen = addr_len;
+    }
+
+    int ret = sendmmsg(fd, msgs, count, 0);
+    if (ret < 0) {
+        mylog(log_warn, "sendmmsg failed: %s\n", strerror(errno));
+    } else if (ret < count) {
+        mylog(log_debug, "sendmmsg partial: %d/%d sent\n", ret, count);
+    }
+    return ret;
+#else
+    int sent = 0;
+    for (int i = 0; i < count; i++) {
+        int ret = addr_ptr ? sendto(fd, data_arr[i], len_arr[i], 0, addr_ptr, addr_len)
+                           : send(fd, data_arr[i], len_arr[i], 0);
+        if (ret >= 0) sent++;
+    }
+    return sent;
+#endif
+}
+
 int my_send(const dest_t &dest, char *data, int len) {
     if (dest.cook) {
-        do_cook(data, len);
+        do_cook(&cook_ctx, data, len);
     }
     switch (dest.type) {
         case type_fd_addr: {
@@ -238,7 +213,7 @@ int put_conv0(u32_t conv, const char *input, int len_in, char *&output, int &len
     u32_t n_conv = htonl(conv);
     memcpy(output, &n_conv, sizeof(n_conv));
     memcpy(output + sizeof(n_conv), input, len_in);
-    u32_t crc32 = (u32_t)crc32_fast(output, len_in + sizeof(crc32));
+    u32_t crc32 = (u32_t)crc32c(output, len_in + sizeof(crc32));
     u32_t crc32_n = htonl(crc32);
     len_out = len_in + (int)(sizeof(n_conv)) + (int)sizeof(crc32_n);
     memcpy(output + len_in + (int)(sizeof(n_conv)), &crc32_n, sizeof(crc32_n));
@@ -258,56 +233,12 @@ int get_conv0(u32_t &conv, const char *input, int len_in, char *&output, int &le
     }
     memcpy(&crc32_n, input + len_in - (int)sizeof(crc32_n), sizeof(crc32_n));
     u32_t crc32 = ntohl(crc32_n);
-    if (crc32 != (u32_t)crc32_fast(input, len_in - sizeof(crc32_n))) {
+    if (crc32 != (u32_t)crc32c(input, len_in - sizeof(crc32_n))) {
         mylog(log_debug, "crc32 check failed\n");
         return -1;
     }
     return 0;
 }
-int put_crc32(char *s, int &len) {
-    if (disable_checksum) return 0;
-    assert(len >= 0);
-    // if(len<0) return -1;
-    u32_t crc32 = (u32_t)crc32_fast(s, len);
-    write_u32(s + len, crc32);
-    len += sizeof(u32_t);
-
-    return 0;
-}
-
-int do_cook(char *data, int &len) {
-    put_crc32(data, len);
-    if (!disable_obscure) do_obscure(data, len);
-    if (!disable_xor) encrypt_0(data, len, key_string);
-    return 0;
-}
-
-int de_cook(char *s, int &len) {
-    if (!disable_xor) decrypt_0(s, len, key_string);
-    if (!disable_obscure) {
-        int ret = de_obscure(s, len);
-        if (ret != 0) {
-            mylog(log_debug, "de_obscure fail\n");
-            return ret;
-        }
-    }
-    int ret = rm_crc32(s, len);
-    if (ret != 0) {
-        mylog(log_debug, "rm_crc32 fail\n");
-        return ret;
-    }
-    return 0;
-}
-int rm_crc32(char *s, int &len) {
-    if (disable_checksum) return 0;
-    assert(len >= 0);
-    len -= sizeof(u32_t);
-    if (len < 0) return -1;
-    u32_t crc32_in = read_u32(s + len);
-    u32_t crc32 = (u32_t)crc32_fast(s, len);
-    if (crc32 != crc32_in) return -1;
-    return 0;
-}
 /*
 int do_obs()
 {
@@ -324,6 +255,14 @@ int put_conv(u32_t conv, const char *input, int len_in, char *&output, int &len_
 
     return 0;
 }
+int put_conv_inplace(u32_t conv, char *buf, int data_len, int &len_out) {
+    /* The caller has already placed data_len payload bytes at buf + sizeof(u32_t);
+     * only the conv header in front of them (buf[0..3]) is written here. */
+    const u32_t conv_be = htonl(conv); /* conv id in network byte order */
+    len_out = (int)sizeof(conv_be) + data_len;
+    memcpy(buf, &conv_be, sizeof(conv_be));
+    return 0;
+}
 int get_conv(u32_t &conv, const char *input, int len_in, char *&output, int &len_out) {
     u32_t n_conv;
     memcpy(&n_conv, input, sizeof(n_conv));
diff --git a/packet.h b/packet.h
index b8d2582..1cdd327 100644
--- a/packet.h
+++ b/packet.h
@@ -10,37 +10,29 @@
 
 #include "common.h"
 #include "fd_manager.h"
+#include "packet_cook.h"
 
-extern int iv_min;
-extern int iv_max;  //< 256;
+extern cook_ctx_t cook_ctx;
 
 extern u64_t packet_send_count;
 extern u64_t dup_packet_send_count;
 extern u64_t packet_recv_count;
 extern u64_t dup_packet_recv_count;
-extern char key_string[1000];
 extern int disable_replay_filter;
 extern int random_drop;
-extern int disable_obscure;
-extern int disable_xor;
 
 int my_send(const dest_t &dest, char *data, int len);
+int my_send_batch(const dest_t &dest, char **data_arr, int *len_arr, int count);
 
-void encrypt_0(char *input, int &len, char *key);
-void decrypt_0(char *input, int &len, char *key);
 int add_seq(char *data, int &data_len);
 int remove_seq(char *data, int &data_len);
-int do_obscure(const char *input, int in_len, char *output, int &out_len);
-int de_obscure(const char *input, int in_len, char *output, int &out_len);
 
-// int sendto_fd_u64 (int fd,u64_t u64,char * buf, int len,int flags);
 int sendto_ip_port(u32_t ip, int port, char *buf, int len, int flags);
 int send_fd(int fd, char *buf, int len, int flags);
 
 int put_conv(u32_t conv, const char *input, int len_in, char *&output, int &len_out);
+int put_conv_inplace(u32_t conv, char *buf, int data_len, int &len_out);
 int get_conv(u32_t &conv, const char *input, int len_in, char *&output, int &len_out);
-int put_crc32(char *s, int &len);
-int rm_crc32(char *s, int &len);
-int do_cook(char *data, int &len);
-int de_cook(char *s, int &len);
+int put_conv0(u32_t conv, const char *input, int len_in, char *&output, int &len_out);
+int get_conv0(u32_t &conv, const char *input, int len_in, char *&output, int &len_out);
 #endif /* PACKET_H_ */
diff --git a/packet_cook.cpp b/packet_cook.cpp
new file mode 100644
index 0000000..be8017d
--- /dev/null
+++ b/packet_cook.cpp
@@ -0,0 +1,459 @@
+#include "packet_cook.h"
+#include "crc32c.h"
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <emmintrin.h>  /* SSE2 — baseline on all x86_64 */
+#include <immintrin.h>  /* AVX2 — for xor_tile_avx2 (guarded by target attr) */
+#define COOK_VEC_WIDTH 16
+#elif defined(__aarch64__)
+#include <arm_neon.h>
+#define COOK_VEC_WIDTH 16
+#elif defined(HAVE_PPC_SPE)
+#define COOK_VEC_WIDTH 8
+#else
+#define COOK_VEC_WIDTH ((int)sizeof(unsigned long))
+#endif
+
+#ifdef HAVE_PPC_SPE
+extern "C" void xor_tile_spe(char *data, int len, const char *tile, int tile_len);
+#endif
+
+/* Provided by common.cpp in production, stubs in bench */
+extern "C++" void get_fake_random_chars(char *s, int len);
+extern "C++" int random_between(uint32_t a, uint32_t b);
+
+static const int cook_buf_len = 3800; /* matches common.h buf_len */
+
+static void
+cook_write_u32(char *p, uint32_t l)
+{
+    /* Store l big-endian: most significant byte at p[0], least at p[3]. */
+    unsigned char *out = (unsigned char *)p;
+    for (int k = 0; k < 4; k++)
+        out[k] = (unsigned char)((l >> (24 - 8 * k)) & 0xff);
+}
+
+static uint32_t
+cook_read_u32(char *p)
+{
+    /* Assemble a big-endian 32-bit value: p[0] is the most significant byte. */
+    const unsigned char *in = (const unsigned char *)p;
+    uint32_t res = 0;
+    for (int k = 0; k < 4; k++)
+        res = (res << 8) | in[k];
+    return res;
+}
+
+/* --- SIMD repeating-pattern XOR ----------------------------------------- */
+
+static int
+cook_gcd(int a, int b)
+{
+    /* Euclid's algorithm, recursive form; gcd(a, 0) is a. */
+    return b ? cook_gcd(b, a % b) : a;
+}
+
+static int
+cook_lcm(int a, int b)
+{
+    return a / cook_gcd(a, b) * b; /* lcm; divides by the gcd before multiplying */
+}
+
+/*
+ * Replicate pat[0..pat_len-1] across tile[0..tile_len-1] by repeated doubling:
+ * each pass copies the already-filled prefix into the space right after it.
+ * tile_len must be a multiple of pat_len. */
+static void
+expand_tile(char *tile, int tile_len, const char *pat, int pat_len)
+{
+    memcpy(tile, pat, pat_len);
+    for (int done = pat_len; done < tile_len; ) {
+        /* never copy more than what is filled so far, nor past the tile end */
+        const int left = tile_len - done;
+        const int step = left < done ? left : done;
+        memcpy(tile + done, tile, step);
+        done += step;
+    }
+}
+
+/*
+ * XOR data[0..len-1] with tile[0..tile_len-1] repeating.
+ * tile_len MUST be a multiple of COOK_VEC_WIDTH.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+__attribute__((target("avx2"))) /* built with AVX2 enabled; xor_tile() calls it only for tier >= 1 */
+static void
+xor_tile_avx2(char *data, int len, const char *tile, int tile_len)
+{
+    int t = 0, i = 0; /* t: read offset into tile, i: offset into data */
+    if (tile_len == 16) {
+        /* Common case: broadcast 16-byte tile to 256-bit, no wrap logic */
+        __m256i tile256 = _mm256_broadcastsi128_si256(
+            _mm_loadu_si128((const __m128i *)tile));
+        for (; i + 32 <= len; i += 32) {
+            __m256i d = _mm256_loadu_si256((const __m256i *)(data + i));
+            _mm256_storeu_si256((__m256i *)(data + i),
+                _mm256_xor_si256(d, tile256));
+        }
+        /* t stays 0: i is multiple of 32, tile_len=16, so (i % 16) == 0 */
+    } else {
+        /* General case: tile_len is a multiple of 16; two 16-byte key halves per 32 data bytes */
+        for (; i + 32 <= len; i += 32) {
+            __m128i k1 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16;
+            if (t >= tile_len) t -= tile_len; /* wrap to the tile start */
+            __m128i k2 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16;
+            if (t >= tile_len) t -= tile_len;
+            __m256i key = _mm256_set_m128i(k2, k1); /* k1 = low 128 bits, k2 = high 128 bits */
+            __m256i d = _mm256_loadu_si256((const __m256i *)(data + i));
+            _mm256_storeu_si256((__m256i *)(data + i),
+                _mm256_xor_si256(d, key));
+        }
+    }
+    /* SSE2 tail: remaining 16-byte chunks, continuing at tile offset t */
+    for (; i + 16 <= len; i += 16) {
+        __m128i d = _mm_loadu_si128((const __m128i *)(data + i));
+        __m128i k = _mm_loadu_si128((const __m128i *)(tile + t));
+        _mm_storeu_si128((__m128i *)(data + i), _mm_xor_si128(d, k));
+        t += 16;
+        if (t >= tile_len) t = 0;
+    }
+    /* scalar tail: fewer than 16 bytes left, one byte at a time */
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+}
+
+__attribute__((target("avx512bw"))) /* built with AVX-512BW enabled; xor_tile() calls it only for tier >= 2 */
+static void
+xor_tile_avx512(char *data, int len, const char *tile, int tile_len)
+{
+    int t = 0, i = 0; /* t: read offset into tile, i: offset into data */
+    if (tile_len == 16) {
+        /* Common case: broadcast 16-byte tile to 512-bit, no wrap logic */
+        __m512i tile512 = _mm512_broadcast_i32x4(
+            _mm_loadu_si128((const __m128i *)tile));
+        for (; i + 64 <= len; i += 64) {
+            __m512i d = _mm512_loadu_si512(data + i);
+            _mm512_storeu_si512(data + i, _mm512_xor_si512(d, tile512));
+        }
+        /* t stays 0: i is multiple of 64, tile_len=16, so (i % 16) == 0 */
+    } else {
+        /* General case: tile_len is a multiple of 16; four 16-byte key lanes per 64 data bytes */
+        for (; i + 64 <= len; i += 64) {
+            __m128i k1 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16; if (t >= tile_len) t -= tile_len;
+            __m128i k2 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16; if (t >= tile_len) t -= tile_len;
+            __m128i k3 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16; if (t >= tile_len) t -= tile_len;
+            __m128i k4 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16; if (t >= tile_len) t -= tile_len;
+            __m512i key = _mm512_castsi128_si512(k1); /* lane 0; lanes 1-3 are inserted below */
+            key = _mm512_inserti32x4(key, k2, 1);
+            key = _mm512_inserti32x4(key, k3, 2);
+            key = _mm512_inserti32x4(key, k4, 3);
+            __m512i d = _mm512_loadu_si512(data + i);
+            _mm512_storeu_si512(data + i, _mm512_xor_si512(d, key));
+        }
+    }
+    /* AVX2 tail: fewer than 64 bytes remain here, so at most one 32-byte step */
+    if (i + 32 <= len) {
+        __m128i k1 = _mm_loadu_si128((const __m128i *)(tile + t));
+        t += 16; if (t >= tile_len) t -= tile_len;
+        __m128i k2 = _mm_loadu_si128((const __m128i *)(tile + t));
+        t += 16; if (t >= tile_len) t -= tile_len;
+        __m256i key = _mm256_set_m128i(k2, k1); /* k1 = low 128 bits, k2 = high 128 bits */
+        __m256i d = _mm256_loadu_si256((const __m256i *)(data + i));
+        _mm256_storeu_si256((__m256i *)(data + i),
+            _mm256_xor_si256(d, key));
+        i += 32;
+    }
+    /* SSE2 tail: fewer than 32 bytes remain here, so at most one 16-byte step */
+    if (i + 16 <= len) {
+        __m128i d = _mm_loadu_si128((const __m128i *)(data + i));
+        __m128i k = _mm_loadu_si128((const __m128i *)(tile + t));
+        _mm_storeu_si128((__m128i *)(data + i), _mm_xor_si128(d, k));
+        t += 16; if (t >= tile_len) t -= tile_len;
+        i += 16;
+    }
+    /* scalar tail: fewer than 16 bytes left, one byte at a time */
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+}
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+/* Runtime SIMD tier: 0=SSE2, 1=AVX2, 2=AVX-512BW */
+static int xor_simd_tier = -1;
+#endif
+
+/* XOR data[0..len-1] in place with tile[0..tile_len-1] repeating; tile_len must
+ * be a multiple of COOK_VEC_WIDTH. On x86_64 the widest usable SIMD tier is
+ * detected once via CPUID/XGETBV (max leaf, OSXSAVE and XCR0 state are checked
+ * before trusting the AVX2 / AVX-512 feature bits) and cached in xor_simd_tier. */
+static void
+xor_tile(char *data, int len, const char *tile, int tile_len)
+{
+#if defined(__x86_64__) || defined(_M_X64)
+    if (xor_simd_tier < 0) {
+        unsigned int eax, ebx, ecx, edx, xcr0 = 0;
+        int tier = 0; /* SSE2 is the x86_64 baseline */
+        /* Leaf 7 is only meaningful when the max basic leaf (leaf 0, EAX) is >= 7. */
+        __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(0), "c"(0));
+        unsigned int max_leaf = eax;
+        /* OSXSAVE (leaf 1, ECX bit 27) must be set before XGETBV may be executed. */
+        __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(1), "c"(0));
+        if (max_leaf >= 7 && (ecx & (1u << 27))) {
+            __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "edx");
+            __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(7), "c"(0));
+            /* AVX2 (EBX bit 5) is usable only if the OS saves XMM+YMM state. */
+            if ((xcr0 & 0x06) == 0x06 && ((ebx >> 5) & 1)) {
+                tier = 1;
+                /* AVX-512F+BW (EBX bits 16, 30) also need opmask+ZMM state. */
+                if ((xcr0 & 0xE6) == 0xE6 && ((ebx >> 16) & 1) && ((ebx >> 30) & 1))
+                    tier = 2;
+            }
+        }
+        xor_simd_tier = tier;
+    }
+    if (xor_simd_tier >= 2) {
+        xor_tile_avx512(data, len, tile, tile_len);
+        return;
+    }
+    if (xor_simd_tier >= 1) {
+        xor_tile_avx2(data, len, tile, tile_len);
+        return;
+    }
+    int t = 0, i = 0;
+    for (; i + 16 <= len; i += 16) {
+        __m128i d = _mm_loadu_si128((const __m128i *)(data + i));
+        __m128i k = _mm_loadu_si128((const __m128i *)(tile + t));
+        _mm_storeu_si128((__m128i *)(data + i), _mm_xor_si128(d, k));
+        t += 16; if (t >= tile_len) t = 0;
+    }
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+#elif defined(__aarch64__)
+    int t = 0, i = 0;
+    for (; i + 32 <= len; i += 32) {
+        uint8x16_t d1 = vld1q_u8((const uint8_t *)(data + i));
+        uint8x16_t k1 = vld1q_u8((const uint8_t *)(tile + t));
+        t += 16; if (t >= tile_len) t = 0;
+        uint8x16_t d2 = vld1q_u8((const uint8_t *)(data + i + 16));
+        uint8x16_t k2 = vld1q_u8((const uint8_t *)(tile + t));
+        t += 16; if (t >= tile_len) t = 0;
+        vst1q_u8((uint8_t *)(data + i),      veorq_u8(d1, k1));
+        vst1q_u8((uint8_t *)(data + i + 16), veorq_u8(d2, k2));
+    }
+    for (; i + 16 <= len; i += 16) {
+        uint8x16_t d = vld1q_u8((const uint8_t *)(data + i));
+        uint8x16_t k = vld1q_u8((const uint8_t *)(tile + t));
+        vst1q_u8((uint8_t *)(data + i), veorq_u8(d, k));
+        t += 16; if (t >= tile_len) t = 0;
+    }
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+#elif defined(HAVE_PPC_SPE)
+    int t = 0, i = 0;
+    /* Scalar head: align data pointer to 8 bytes for evldd */
+    int head = (8 - ((uintptr_t)data & 7)) & 7;
+    if (head > len) head = len;
+    for (; i < head; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+    int remaining = len - i;
+    if (remaining >= 8 && t != 0 && tile_len <= 512) {
+        /* Tile offset not 0 after head — rotate tile so SPE sees offset=0 */
+        char rtile[512 + 8];
+        memcpy(rtile, tile + t, tile_len - t);
+        memcpy(rtile + (tile_len - t), tile, t);
+        memcpy(rtile + tile_len, rtile, 8); /* SPE evldd padding */
+        xor_tile_spe(data + i, remaining, rtile, tile_len);
+    } else if (remaining >= 8 && t == 0) {
+        /* Data aligned and tile offset 0 — call SPE directly */
+        xor_tile_spe(data + i, remaining, tile, tile_len);
+    } else {
+        /* Too short for SPE, or the rotated tile would not fit rtile (long
+         * key tiles) — scalar loop instead of overrunning the stack */
+        for (; i < len; i++) {
+            data[i] ^= tile[t];
+            if (++t >= tile_len) t = 0;
+        }
+    }
+#else
+    /* Word-width XOR for generic platforms (MIPS, RISC-V, PPC, i486, ARMv7).
+     * sizeof(unsigned long) = 4 on 32-bit, 8 on 64-bit. */
+    int t = 0, i = 0;
+    for (; i + COOK_VEC_WIDTH <= len; i += COOK_VEC_WIDTH) {
+        unsigned long d, k;
+        memcpy(&d, data + i, sizeof(d));
+        memcpy(&k, tile + t, sizeof(k));
+        d ^= k;
+        memcpy(data + i, &d, sizeof(d));
+        t += COOK_VEC_WIDTH;
+        if (t >= tile_len) t = 0;
+    }
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+#endif
+}
+
+/* XOR data[0..len-1] in place with pat[0..pat_len-1] repeating, via a stack tile
+ * of lcm(pat_len, COOK_VEC_WIDTH) bytes handed to xor_tile(). Used for the
+ * obscure IV, which changes per packet. No-op when pat_len <= 0 or len <= 0. */
+static void
+xor_with_pattern(char *data, int len, const char *pat, int pat_len)
+{
+    if (pat_len <= 0 || len <= 0) return;
+    int tile_len = cook_lcm(pat_len, COOK_VEC_WIDTH);
+    if (tile_len > 512) { /* too long to tile: scalar, no stack overrun under NDEBUG */
+        for (int i = 0; i < len; i++) data[i] ^= pat[i % pat_len];
+        return;
+    }
+    /* COOK_VEC_WIDTH extra bytes: a copy of the tile start, so a load that straddles the tile end is safe. */
+    char tile[512 + COOK_VEC_WIDTH];
+    expand_tile(tile, tile_len, pat, pat_len);
+    memcpy(tile + tile_len, tile, COOK_VEC_WIDTH);
+    xor_tile(data, len, tile, tile_len);
+}
+
+/* --- Key preparation ---------------------------------------------------- */
+
+void
+cook_ctx_prepare_key(cook_ctx_t *ctx)
+{
+    /* Cache strlen(key) and pre-expand the key into key_tile for xor_tile();
+     * an empty key leaves key_tile_len at 0 and key_tile untouched. */
+    const int klen = (int)strlen(ctx->key);
+    ctx->key_len = klen;
+    ctx->key_tile_len = klen ? cook_lcm(klen, COOK_VEC_WIDTH) : 0;
+    if (klen == 0) return;
+    assert(ctx->key_tile_len + COOK_VEC_WIDTH <= (int)sizeof(ctx->key_tile));
+    expand_tile(ctx->key_tile, ctx->key_tile_len, ctx->key, klen);
+    memcpy(ctx->key_tile + ctx->key_tile_len, ctx->key_tile, COOK_VEC_WIDTH);
+}
+
+/* --- Cook operations ---------------------------------------------------- */
+
+static void
+encrypt_0(cook_ctx_t *ctx, char *input, int len)
+{
+    if (ctx->key_tile_len != 0) /* no prepared key tile: leave input untouched */
+        xor_tile(input, len, ctx->key_tile, ctx->key_tile_len);
+}
+
+static int
+do_obscure(cook_ctx_t *ctx, char *data, int &len)
+{
+    /* Resulting layout: [payload ^ IV (len)] [IV (iv_len)] [iv_len (1 byte)]. */
+    assert(len >= 0);
+    assert(len < cook_buf_len);
+    const int iv_len = random_between(ctx->iv_min, ctx->iv_max);
+    char *iv = data + len;
+    get_fake_random_chars(iv, iv_len);
+    iv[iv_len] = (uint8_t)iv_len;
+    xor_with_pattern(data, len, iv, iv_len);
+    len += iv_len + 1;
+    return 0;
+}
+
+static int
+de_obscure(cook_ctx_t *ctx, char *data, int &len)
+{
+    /* The last byte holds iv_len and the IV sits right before it; returns -1
+     * (len untouched) when that trailer is missing or out of range. */
+    if (len < 1) return -1;
+    const int iv_len = int((uint8_t)data[len - 1]);
+    const bool iv_in_range = iv_len >= ctx->iv_min && iv_len <= ctx->iv_max;
+    if (!iv_in_range || len < 1 + iv_len) return -1;
+    const int payload_len = len - 1 - iv_len;
+    xor_with_pattern(data, payload_len, data + payload_len, iv_len);
+    len = payload_len;
+    return 0;
+}
+
+static int
+put_crc32(cook_ctx_t *ctx, char *s, int &len)
+{
+    if (!ctx->disable_checksum) { /* append big-endian CRC32C of s[0..len-1] */
+        assert(len >= 0);
+        cook_write_u32(s + len, (uint32_t)crc32c(s, len));
+        len += (int)sizeof(uint32_t);
+    }
+    return 0;
+}
+
+static int
+rm_crc32(cook_ctx_t *ctx, char *s, int &len)
+{
+    /* Strip and verify the trailing CRC32C; len is shortened even on failure. */
+    if (ctx->disable_checksum) return 0;
+    assert(len >= 0);
+    len -= (int)sizeof(uint32_t);
+    if (len < 0) return -1;
+    const uint32_t stored = cook_read_u32(s + len);
+    const uint32_t computed = (uint32_t)crc32c(s, len);
+    return stored == computed ? 0 : -1;
+}
+
+int
+do_cook(cook_ctx_t *ctx, char *data, int &len)
+{
+    put_crc32(ctx, data, len);                              /* 1. append checksum (no-op if disabled) */
+    if (!ctx->disable_obscure) do_obscure(ctx, data, len);  /* 2. XOR with random IV, append IV + length */
+    if (!ctx->disable_xor) encrypt_0(ctx, data, len);       /* 3. XOR with the prepared key tile */
+    return 0;                                               /* always 0; len is updated in place */
+}
+
+int
+de_cook(cook_ctx_t *ctx, char *data, int &len)
+{
+    /* Key XOR, then de-obscure, then checksum removal. Returns 0 on success,
+     * -1 when de_obscure fails, -2 when rm_crc32 fails. */
+    if (!ctx->disable_xor)
+        encrypt_0(ctx, data, len); /* XOR is its own inverse */
+    if (!ctx->disable_obscure && de_obscure(ctx, data, len) != 0)
+        return -1;
+    if (rm_crc32(ctx, data, len) != 0) return -2;
+    return 0;
+}
+
+#ifdef BENCH_EXPOSE_INTERNALS
+void bench_xor_tile(char *data, int len, const char *tile, int tile_len) { /* exposes the static xor_tile() */
+    xor_tile(data, len, tile, tile_len);
+}
+int bench_cook_vec_width() { /* reports the compile-time COOK_VEC_WIDTH selected for this target */
+    return COOK_VEC_WIDTH;
+}
+const char *bench_xor_tile_impl() { /* name of the XOR implementation xor_tile() uses on this build/CPU */
+    /* Trigger detection if not yet run: on x86_64 the first xor_tile() call fills xor_simd_tier */
+    char dummy[16] = {}, tile[16] = {};
+    xor_tile(dummy, 1, tile, 16);
+#if defined(__x86_64__) || defined(_M_X64)
+    if (xor_simd_tier >= 2) return "avx512bw";
+    if (xor_simd_tier >= 1) return "avx2";
+    return "sse2";
+#elif defined(__aarch64__)
+    return "neon";
+#elif defined(HAVE_PPC_SPE)
+    return "spe";
+#else
+    return "scalar";
+#endif
+}
+#endif
diff --git a/packet_cook.h b/packet_cook.h
new file mode 100644
index 0000000..6a3a328
--- /dev/null
+++ b/packet_cook.h
@@ -0,0 +1,20 @@
+#ifndef PACKET_COOK_H_
+#define PACKET_COOK_H_
+
+struct cook_ctx_t {
+    char key[1000];           /* NUL-terminated key; cook_ctx_prepare_key takes strlen() of it */
+    int key_len;              /* cached strlen(key), set by cook_ctx_prepare_key */
+    int key_tile_len;         /* lcm(key_len, vec_width), 0 if no key */
+    char key_tile[16000];     /* key repeated to key_tile_len, plus vec_width bytes copied from its start */
+    int iv_min;               /* lower bound given to random_between() for the obscure IV length */
+    int iv_max;               /* upper bound; de_obscure rejects IV lengths outside [iv_min, iv_max] */
+    int disable_checksum;     /* non-zero: put_crc32/rm_crc32 return 0 without touching the data */
+    int disable_obscure;      /* non-zero: do_cook/de_cook skip the obscure step */
+    int disable_xor;          /* non-zero: do_cook/de_cook skip the key XOR */
+};
+
+void cook_ctx_prepare_key(cook_ctx_t *ctx);
+int do_cook(cook_ctx_t *ctx, char *data, int &len);
+int de_cook(cook_ctx_t *ctx, char *data, int &len);
+
+#endif /* PACKET_COOK_H_ */
diff --git a/tunnel_client.cpp b/tunnel_client.cpp
index 83ccf05..477ef4d 100644
--- a/tunnel_client.cpp
+++ b/tunnel_client.cpp
@@ -1,11 +1,9 @@
 #include "tunnel.h"
+#include "io_uring_recv.h"
 
-void data_from_local_or_fec_timeout(conn_info_t &conn_info, int is_time_out) {
+static void client_process_local_packet(conn_info_t &conn_info, char *data, int data_len,
+                                         struct sockaddr *src_addr, socklen_t src_addr_len) {
     fd64_t &remote_fd64 = conn_info.remote_fd64;
-    int &local_listen_fd = conn_info.local_listen_fd;
-
-    char data[buf_len];
-    int data_len;
     address_t addr;
     u32_t conv;
     int out_n;
@@ -17,126 +15,62 @@ void data_from_local_or_fec_timeout(conn_info_t &conn_info, int is_time_out) {
     dest.inner.fd64 = remote_fd64;
     dest.cook = 1;
 
-    if (is_time_out) {
-        // fd64_t fd64=events[idx].data.u64;
-        mylog(log_trace, "events[idx].data.u64 == conn_info.fec_encode_manager.get_timer_fd64()\n");
-
-        // uint64_t value;
-        // if(!fd_manager.exist(fd64))   //fd64 has been closed
-        //{
-        //	mylog(log_trace,"!fd_manager.exist(fd64)");
-        //	continue;
-        // }
-        // if((ret=read(fd_manager.to_fd(fd64), &value, 8))!=8)
-        //{
-        //	mylog(log_trace,"(ret=read(fd_manager.to_fd(fd64), &value, 8))!=8,ret=%d\n",ret);
-        //	continue;
-        // }
-        // if(value==0)
-        //{
-        //	mylog(log_debug,"value==0\n");
-        //	continue;
-        // }
-        // assert(value==1);
-        from_normal_to_fec(conn_info, 0, 0, out_n, out_arr, out_len, out_delay);
-    } else  // events[idx].data.u64 == (u64_t)local_listen_fd
-    {
-        mylog(log_trace, "events[idx].data.u64 == (u64_t)local_listen_fd\n");
-        address_t::storage_t udp_new_addr_in = {0};
-        socklen_t udp_new_addr_len = sizeof(address_t::storage_t);
-        if ((data_len = recvfrom(local_listen_fd, data, max_data_len + 1, 0,
-                                 (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len)) == -1) {
-            mylog(log_debug, "recv_from error,this shouldnt happen,err=%s,but we can try to continue\n", get_sock_error());
-            return;
-        };
-
-        if (data_len == max_data_len + 1) {
-            mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len);
-            return;
-        }
-
-        if (!disable_mtu_warn && data_len >= mtu_warn) {
-            mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn);
-        }
+    if (data_len == max_data_len + 1) {
+        mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len);
+        return;
+    }
 
-        addr.from_sockaddr((struct sockaddr *)&udp_new_addr_in, udp_new_addr_len);
+    if (!disable_mtu_warn && data_len >= mtu_warn) {
+        mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn);
+    }
 
-        mylog(log_trace, "Received packet from %s, len: %d\n", addr.get_str(), data_len);
+    addr.from_sockaddr(src_addr, src_addr_len);
 
-        // u64_t u64=ip_port.to_u64();
+    mylog(log_trace, "Received packet from %s, len: %d\n", addr.get_str(), data_len);
 
-        if (!conn_info.conv_manager.c.is_data_used(addr)) {
-            if (conn_info.conv_manager.c.get_size() >= max_conv_num) {
-                mylog(log_warn, "ignored new udp connect bc max_conv_num exceed\n");
-                return;
-            }
-            conv = conn_info.conv_manager.c.get_new_conv();
-            conn_info.conv_manager.c.insert_conv(conv, addr);
-            mylog(log_info, "new packet from %s,conv_id=%x\n", addr.get_str(), conv);
-        } else {
-            conv = conn_info.conv_manager.c.find_conv_by_data(addr);
-            mylog(log_trace, "conv=%d\n", conv);
+    if (!conn_info.conv_manager.c.is_data_used(addr)) {
+        if (conn_info.conv_manager.c.get_size() >= max_conv_num) {
+            mylog(log_warn, "ignored new udp connect bc max_conv_num exceed\n");
+            return;
         }
-        conn_info.conv_manager.c.update_active_time(conv);
-        char *new_data;
-        int new_len;
-        put_conv(conv, data, data_len, new_data, new_len);
-
-        mylog(log_trace, "data_len=%d new_len=%d\n", data_len, new_len);
-        from_normal_to_fec(conn_info, new_data, new_len, out_n, out_arr, out_len, out_delay);
+        conv = conn_info.conv_manager.c.get_new_conv();
+        conn_info.conv_manager.c.insert_conv(conv, addr);
+        mylog(log_info, "new packet from %s,conv_id=%x\n", addr.get_str(), conv);
+    } else {
+        conv = conn_info.conv_manager.c.find_conv_by_data(addr);
+        mylog(log_trace, "conv=%d\n", conv);
     }
-    mylog(log_trace, "out_n=%d\n", out_n);
-    for (int i = 0; i < out_n; i++) {
-        delay_send(out_delay[i], dest, out_arr[i], out_len[i]);
-    }
-}
-static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {
-    assert(!(revents & EV_ERROR));
+    conn_info.conv_manager.c.update_active_time(conv);
+    int new_len;
+    put_conv_inplace(conv, data, data_len, new_len);
 
-    conn_info_t &conn_info = *((conn_info_t *)watcher->data);
+    mylog(log_trace, "data_len=%d new_len=%d\n", data_len, new_len);
+    from_normal_to_fec(conn_info, data, new_len, out_n, out_arr, out_len, out_delay);
 
-    data_from_local_or_fec_timeout(conn_info, 0);
+    mylog(log_trace, "out_n=%d\n", out_n);
+    delay_send_batch(out_n, out_delay, dest, out_arr, out_len);
 }
 
-static void remote_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {
-    assert(!(revents & EV_ERROR));
-
-    conn_info_t &conn_info = *((conn_info_t *)watcher->data);
-
-    char data[buf_len];
-    if (!fd_manager.exist(watcher->u64))  // fd64 has been closed
-    {
-        mylog(log_trace, "!fd_manager.exist(events[idx].data.u64)");
-        return;
-    }
-    fd64_t &remote_fd64 = conn_info.remote_fd64;
-    int &remote_fd = conn_info.remote_fd;
-
-    assert(watcher->u64 == remote_fd64);
-
-    int fd = fd_manager.to_fd(remote_fd64);
-
-    int data_len = recv(fd, data, max_data_len + 1, 0);
-
+static void client_process_remote_packet(conn_info_t &conn_info, char *data, int data_len) {
     if (data_len == max_data_len + 1) {
         mylog(log_warn, "huge packet, data_len > %d, packet truncated, dropped\n", max_data_len);
         return;
     }
 
-    mylog(log_trace, "received data from udp fd %d, len=%d\n", remote_fd, data_len);
+    mylog(log_trace, "received data from remote, len=%d\n", data_len);
     if (data_len < 0) {
         if (get_sock_errno() == ECONNREFUSED) {
-            mylog(log_debug, "recv failed %d ,udp_fd%d,errno:%s\n", data_len, remote_fd, get_sock_error());
+            mylog(log_debug, "recv failed %d ,errno:%s\n", data_len, get_sock_error());
         }
 
-        mylog(log_warn, "recv failed %d ,udp_fd%d,errno:%s\n", data_len, remote_fd, get_sock_error());
+        mylog(log_warn, "recv failed %d ,errno:%s\n", data_len, get_sock_error());
         return;
     }
     if (!disable_mtu_warn && data_len > mtu_warn) {
         mylog(log_warn, "huge packet,data len=%d (>%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn);
     }
 
-    if (de_cook(data, data_len) != 0) {
+    if (de_cook(&cook_ctx, data, data_len) != 0) {
         mylog(log_debug, "de_cook error");
         return;
     }
@@ -174,6 +108,66 @@ static void remote_cb(struct ev_loop *loop, struct ev_io *watcher, int revents)
     }
 }
 
+void data_from_local_or_fec_timeout(conn_info_t &conn_info, int is_time_out) {  // client side: flush the FEC encoder on timeout (is_time_out != 0), else read and process one local datagram
+    fd64_t &remote_fd64 = conn_info.remote_fd64;
+    int &local_listen_fd = conn_info.local_listen_fd;
+    int out_n;             // out_* are filled by from_normal_to_fec(); only used on the timeout branch
+    char **out_arr;
+    int *out_len;
+    my_time_t *out_delay;
+    dest_t dest;           // destination for the timeout branch: the remote fd64, with cook enabled
+    dest.type = type_fd64;
+    dest.inner.fd64 = remote_fd64;
+    dest.cook = 1;
+
+    if (is_time_out) {
+        mylog(log_trace, "events[idx].data.u64 == conn_info.fec_encode_manager.get_timer_fd64()\n");
+        from_normal_to_fec(conn_info, 0, 0, out_n, out_arr, out_len, out_delay);  // null data / zero length: no new payload is fed in
+        mylog(log_trace, "out_n=%d\n", out_n);
+        delay_send_batch(out_n, out_delay, dest, out_arr, out_len);
+    } else {
+        /* Single-packet path: one recvfrom() per call; reached from local_listen_cb. */
+        char data[buf_len];
+        int data_len;
+        address_t::storage_t udp_new_addr_in = {0};
+        socklen_t udp_new_addr_len = sizeof(address_t::storage_t);
+        if ((data_len = recvfrom(local_listen_fd, data + sizeof(u32_t), max_data_len + 1, 0,  // payload lands sizeof(u32_t) bytes into data, leaving headroom in front
+                                 (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len)) == -1) {
+            mylog(log_debug, "recv_from error,this shouldnt happen,err=%s,but we can try to continue\n", get_sock_error());
+            return;
+        };
+        client_process_local_packet(conn_info, data, data_len,  // receives the buffer base (headroom start), not the payload pointer
+                                     (struct sockaddr *)&udp_new_addr_in, udp_new_addr_len);
+    }
+}
+static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {  // libev read callback for the client's local listen socket
+    assert(!(revents & EV_ERROR));
+
+    conn_info_t &conn_info = *((conn_info_t *)watcher->data);  // watcher->data carries the conn_info_t pointer
+
+    data_from_local_or_fec_timeout(conn_info, 0);  // 0 => not a timeout: read and process one datagram
+}
+
+static void remote_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {  // libev read callback for the client's remote socket: recv() one datagram and hand it on
+    assert(!(revents & EV_ERROR));
+
+    conn_info_t &conn_info = *((conn_info_t *)watcher->data);
+
+    if (!fd_manager.exist(watcher->u64))  // fd64 has been closed
+    {
+        mylog(log_trace, "!fd_manager.exist(events[idx].data.u64)");
+        return;
+    }
+    fd64_t &remote_fd64 = conn_info.remote_fd64;
+    assert(watcher->u64 == remote_fd64);  // the watcher must belong to the connection's current remote fd64
+
+    int fd = fd_manager.to_fd(remote_fd64);
+
+    char data[buf_len];
+    int data_len = recv(fd, data, max_data_len + 1, 0);  // one byte more than max_data_len so an oversized datagram shows up as exactly max_data_len + 1
+    client_process_remote_packet(conn_info, data, data_len);  // recv() errors (data_len < 0) are handled by the callee
+}
+
 static void fifo_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {
     assert(!(revents & EV_ERROR));
     int fifo_fd = watcher->fd;
@@ -229,18 +223,98 @@ static void conn_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int re
         dest.inner.fd64 = conn_info.remote_fd64;
         dest.cook = 1;
         from_normal_to_fec(conn_info, 0, 0, out_n, out_arr, out_len, out_delay);
-        for (int i = 0; i < out_n; i++) {
-            delay_send(out_delay[i], dest, out_arr[i], out_len[i]);
-        }
+        delay_send_batch(out_n, out_delay, dest, out_arr, out_len);
     }
 }
 
+#ifdef __linux__
+static uring_ctx_t client_uring_ctx;
+static conn_info_t *client_uring_conn_info;
+static void client_uring_drain(struct ev_loop *loop);
+#endif
+
 static void prepare_cb(struct ev_loop *loop, struct ev_prepare *watcher, int revents) {
     assert(!(revents & EV_ERROR));
 
     delay_manager.check();
 }
 
+
+#ifdef __linux__
+
+static void client_uring_drain(struct ev_loop *loop) {  // drain every pending io_uring completion for the client's local and remote sockets; loop is unused here
+    conn_info_t &conn_info = *client_uring_conn_info;
+    uring_ctx_t *ctx = &client_uring_ctx;
+
+    for (;;) {  // repeat until the completion queue reports nothing ready
+        unsigned ready = uring_cq_ready(ctx);
+        if (ready == 0)
+            break;
+
+        int need_submit = 0;  // set when a receive was re-armed and must be submitted
+
+        for (unsigned i = 0; i < ready; i++) {
+            struct io_uring_cqe *cqe = uring_cqe_at(ctx, i);
+            uint8_t type = uring_tag_type(cqe->user_data);  // which socket this completion belongs to
+            int more = cqe->flags & IORING_CQE_F_MORE;      // non-zero while the multishot request stays armed
+
+            if (cqe->res < 0) {  // failed completion: no packet to process
+                if (!more && cqe->res != -ECANCELED) {  // request ended for a reason other than cancellation: re-arm it
+                    if (type == URING_TAG_CLIENT_LOCAL)
+                        uring_add_multishot_recvmsg(ctx, conn_info.local_listen_fd, cqe->user_data);
+                    else if (type == URING_TAG_CLIENT_REMOTE)
+                        uring_add_multishot_recv(ctx, fd_manager.to_fd(conn_info.remote_fd64), cqe->user_data);
+                    need_submit = 1;
+                }
+                continue;
+            }
+
+            if (type == URING_TAG_CLIENT_LOCAL) {
+                uring_recv_buf_t recv_buf;
+                if (uring_parse_recvmsg_cqe(ctx, cqe, &recv_buf) == 0) {  // NOTE(review): on parse failure the buffer is not recycled here - confirm the helper covers that
+                    /* Zero-copy: recvmsg has 140+ bytes of headroom before payload;
+                       use sizeof(u32_t) of it for in-place conv header insertion. */
+                    char *data = recv_buf.data - sizeof(u32_t);  // NOTE(review): relies on writable headroom before recv_buf.data - confirm in io_uring_recv.h
+                    int data_len = recv_buf.len < (int)(buf_len - sizeof(u32_t)) ? recv_buf.len : (int)(buf_len - sizeof(u32_t));  // clamp to buf_len minus the header room
+                    client_process_local_packet(conn_info, data, data_len,
+                                                 (struct sockaddr *)&recv_buf.addr, recv_buf.addr_len);
+                    uring_recycle_buf(ctx, recv_buf.buf_id);  // recycle the buffer only after the packet has been processed
+                }
+            } else if (type == URING_TAG_CLIENT_REMOTE) {
+                uring_recv_buf_t recv_buf;
+                if (uring_parse_recv_cqe(ctx, cqe, &recv_buf) == 0) {
+                    client_process_remote_packet(conn_info, recv_buf.data, recv_buf.len);  // length is passed through unclamped
+                    uring_recycle_buf(ctx, recv_buf.buf_id);
+                }
+            }
+
+            if (!more) {  // successful completion but the multishot request has ended: re-arm it
+                if (type == URING_TAG_CLIENT_LOCAL)
+                    uring_add_multishot_recvmsg(ctx, conn_info.local_listen_fd, cqe->user_data);
+                else if (type == URING_TAG_CLIENT_REMOTE)
+                    uring_add_multishot_recv(ctx, fd_manager.to_fd(conn_info.remote_fd64), cqe->user_data);
+                need_submit = 1;
+            }
+        }
+
+        /* Single batched advance + buffer commit */
+        uring_cq_advance(ctx, ready);
+        uring_buf_ring_commit(ctx);
+
+        /* Submit any re-arms and flush deferred completions in one syscall */
+        if (need_submit)
+            uring_submit_and_flush(ctx);
+        else
+            uring_flush(ctx);
+    }
+}
+
+static void client_uring_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {  // libev read callback on the io_uring ring fd
+    assert(!(revents & EV_ERROR));
+    client_uring_drain(loop);  // process all completions currently queued
+}
+#endif
+
 int tunnel_client_event_loop() {
     int i, j, k;
     int ret;
@@ -268,19 +342,6 @@ int tunnel_client_event_loop() {
 
     conn_info.loop = loop;
 
-    // ev.events = EPOLLIN;
-    // ev.data.u64 = local_listen_fd;
-    // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, local_listen_fd, &ev);
-    // if (ret!=0) {
-    //	mylog(log_fatal,"add  udp_listen_fd error\n");
-    //	myexit(-1);
-    // }
-    struct ev_io local_listen_watcher;
-    local_listen_watcher.data = &conn_info;
-
-    ev_io_init(&local_listen_watcher, local_listen_cb, local_listen_fd, EV_READ);
-    ev_io_start(loop, &local_listen_watcher);
-
     int &remote_fd = conn_info.remote_fd;
     fd64_t &remote_fd64 = conn_info.remote_fd64;
 
@@ -289,21 +350,37 @@ int tunnel_client_event_loop() {
 
     mylog(log_debug, "remote_fd64=%llu\n", remote_fd64);
 
-    // ev.events = EPOLLIN;
-    // ev.data.u64 = remote_fd64;
+    int use_uring = 0;
+#ifdef __linux__
+    if (uring_init(&client_uring_ctx, 64, 256, buf_len) == 0) {
+        g_uring_ctx = &client_uring_ctx;
+        client_uring_conn_info = &conn_info;
+        static struct ev_io uring_watcher;
+        ev_io_init(&uring_watcher, client_uring_cb, client_uring_ctx.ring_fd, EV_READ);
+        ev_io_start(loop, &uring_watcher);
+
+        uring_add_multishot_recvmsg(&client_uring_ctx, local_listen_fd,
+                                      uring_tag(URING_TAG_CLIENT_LOCAL, 0));
+        uring_add_multishot_recv(&client_uring_ctx, remote_fd,
+                                   uring_tag(URING_TAG_CLIENT_REMOTE, 0));
+        uring_submit(&client_uring_ctx);
+        use_uring = 1;
+        mylog(log_info, "io_uring: active for client sockets\n");
+    }
+#endif
 
-    // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, remote_fd, &ev);
-    // if (ret!= 0) {
-    //	mylog(log_fatal,"add raw_fd error\n");
-    //	myexit(-1);
-    // }
+    struct ev_io local_listen_watcher;
+    local_listen_watcher.data = &conn_info;
+    ev_io_init(&local_listen_watcher, local_listen_cb, local_listen_fd, EV_READ);
+    if (!use_uring)
+        ev_io_start(loop, &local_listen_watcher);
 
     struct ev_io remote_watcher;
     remote_watcher.data = &conn_info;
     remote_watcher.u64 = remote_fd64;
-
     ev_io_init(&remote_watcher, remote_cb, remote_fd, EV_READ);
-    ev_io_start(loop, &remote_watcher);
+    if (!use_uring)
+        ev_io_start(loop, &remote_watcher);
 
     // ev.events = EPOLLIN;
     // ev.data.u64 = delay_manager.get_timer_fd();
diff --git a/tunnel_server.cpp b/tunnel_server.cpp
index aaa82c7..79e2be7 100644
--- a/tunnel_server.cpp
+++ b/tunnel_server.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "tunnel.h"
+#include "io_uring_recv.h"
 
 static void conn_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int revents);
 static void fec_encode_cb(struct ev_loop *loop, struct ev_timer *watcher, int revents);
@@ -15,12 +16,57 @@ enum tmp_mode_t { is_from_remote = 0,
                   is_fec_timeout,
                   is_conn_timer };
 
+static void server_process_remote_packet(conn_info_t &conn_info, fd64_t fd64, char *data, int data_len) {  // server side: wrap one datagram received on fd64 with its conv id, FEC-encode it and queue it to the client
+    /* Pre-condition: fd_manager.exist(fd64), data received at data + sizeof(u32_t); data_len is the receive result (negative on error) */
+    assert(conn_info.conv_manager.s.is_data_used(fd64));
+
+    u32_t conv = conn_info.conv_manager.s.find_conv_by_data(fd64);
+    conn_info.conv_manager.s.update_active_time(conv);  // activity is recorded even if the packet is dropped below
+    conn_info.update_active_time();
+
+    if (data_len > max_data_len) {  // '>' rather than '== max_data_len + 1': the io_uring caller clamps to buf_len - sizeof(u32_t), not to max_data_len + 1
+        mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len);
+        return;
+    }
+
+    mylog(log_trace, "received a packet from udp_fd,len:%d,conv=%d\n", data_len, conv);
+
+    if (data_len < 0) {  // receive error reported by the caller
+        mylog(log_debug, "udp fd,recv_len<0 continue,%s\n", get_sock_error());
+        return;
+    }
+
+    if (!disable_mtu_warn && data_len >= mtu_warn) {
+        mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn);
+    }
+
+    int new_len;  // length after the conv header is added, set by put_conv_inplace()
+    put_conv_inplace(conv, data, data_len, new_len);  // presumably writes conv into the headroom at data - confirm against its definition
+
+    address_t &addr = conn_info.addr;
+    int &local_listen_fd = conn_info.local_listen_fd;
+
+    int out_n;
+    char **out_arr;
+    int *out_len;
+    my_time_t *out_delay;
+    dest_t dest;  // send back to the client address through the server's listen socket, with cook enabled
+    dest.inner.fd_addr.fd = local_listen_fd;
+    dest.inner.fd_addr.addr = addr;
+    dest.type = type_fd_addr;
+    dest.cook = 1;
+
+    from_normal_to_fec(conn_info, data, new_len, out_n, out_arr, out_len, out_delay);
+
+    mylog(log_trace, "out_n=%d\n", out_n);
+    delay_send_batch(out_n, out_delay, dest, out_arr, out_len);
+}
+
 void data_from_remote_or_fec_timeout_or_conn_timer(conn_info_t &conn_info, fd64_t fd64, tmp_mode_t mode) {
     int ret;
 
     char data[buf_len];
     int data_len;
-    u32_t conv;
     // fd64_t fd64=events[idx].data.u64;
     // mylog(log_trace,"events[idx].data.u64 >u32_t(-1),%llu\n",(u64_t)events[idx].data.u64);
 
@@ -77,81 +123,39 @@ void data_from_remote_or_fec_timeout_or_conn_timer(conn_info_t &conn_info, fd64_
             return;
         }
 
-        // fd64_t &fd64 =conn_info.remote_fd64;
-        assert(conn_info.conv_manager.s.is_data_used(fd64));
-
-        conv = conn_info.conv_manager.s.find_conv_by_data(fd64);
-        conn_info.conv_manager.s.update_active_time(conv);
-        conn_info.update_active_time();
-
         int fd = fd_manager.to_fd(fd64);
-        data_len = recv(fd, data, max_data_len + 1, 0);
-
-        if (data_len == max_data_len + 1) {
-            mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len);
-            return;
-        }
-
-        mylog(log_trace, "received a packet from udp_fd,len:%d,conv=%d\n", data_len, conv);
-
-        if (data_len < 0) {
-            mylog(log_debug, "udp fd,recv_len<0 continue,%s\n", get_sock_error());
-
-            return;
-        }
-
-        if (!disable_mtu_warn && data_len >= mtu_warn) {
-            mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn);
-        }
-
-        char *new_data;
-        int new_len;
-        put_conv(conv, data, data_len, new_data, new_len);
-
-        from_normal_to_fec(conn_info, new_data, new_len, out_n, out_arr, out_len, out_delay);
+        /* Receive with sizeof(u32_t) headroom for in-place conv header */
+        data_len = recv(fd, data + sizeof(u32_t), max_data_len + 1, 0);
+        server_process_remote_packet(conn_info, fd64, data, data_len);
+        return;
     } else {
         assert(0 == 1);
     }
 
     mylog(log_trace, "out_n=%d\n", out_n);
-    for (int i = 0; i < out_n; i++) {
-        delay_send(out_delay[i], dest, out_arr[i], out_len[i]);
-    }
+    delay_send_batch(out_n, out_delay, dest, out_arr, out_len);
 }
 
-static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {
-    assert(!(revents & EV_ERROR));
-
-    int local_listen_fd = watcher->fd;
+static void server_process_tunnel_packet(struct ev_loop *loop, int local_listen_fd,
+                                          char *data, int data_len,
+                                          struct sockaddr *src_addr, socklen_t src_addr_len) {
     int ret;
 
-    mylog(log_trace, "events[idx].data.u64 == (u64_t)local_listen_fd\n");
-    char data[buf_len];
-    int data_len;
-    address_t::storage_t udp_new_addr_in = {0};
-    socklen_t udp_new_addr_len = sizeof(address_t::storage_t);
-    if ((data_len = recvfrom(local_listen_fd, data, max_data_len + 1, 0,
-                             (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len)) == -1) {
-        mylog(log_error, "recv_from error,this shouldnt happen,err=%s,but we can try to continue\n", get_sock_error());
-        return;
-    };
-
     if (data_len == max_data_len + 1) {
         mylog(log_warn, "huge packet, data_len > %d, packet truncated, dropped\n", max_data_len);
         return;
     }
 
     address_t addr;
-    addr.from_sockaddr((struct sockaddr *)&udp_new_addr_in, udp_new_addr_len);
+    addr.from_sockaddr(src_addr, src_addr_len);
 
     mylog(log_trace, "Received packet from %s,len: %d\n", addr.get_str(), data_len);
 
-    if (!disable_mtu_warn && data_len >= mtu_warn)  ///////////////////////delete this for type 0 in furture
-    {
+    if (!disable_mtu_warn && data_len >= mtu_warn) {
         mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn);
     }
 
-    if (de_cook(data, data_len) != 0) {
+    if (de_cook(&cook_ctx, data, data_len) != 0) {
         mylog(log_debug, "de_cook error");
         return;
     }
@@ -162,33 +166,16 @@ static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int rev
             return;
         }
 
-        // conn_manager.insert(addr);
         conn_info_t &conn_info = conn_manager.find_insert(addr);
         conn_info.addr = addr;
         conn_info.loop = ev_default_loop(0);
         conn_info.local_listen_fd = local_listen_fd;
 
-        // u64_t fec_fd64=conn_info.fec_encode_manager.get_timer_fd64();
-        // mylog(log_debug,"fec_fd64=%llu\n",fec_fd64);
-        // ev.events = EPOLLIN;
-        // ev.data.u64 = fec_fd64;
-        // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd_manager.to_fd(fec_fd64), &ev);
-
-        // fd_manager.get_info(fec_fd64).ip_port=ip_port;
-
         conn_info.timer.data = &conn_info;
         ev_init(&conn_info.timer, conn_timer_cb);
         ev_timer_set(&conn_info.timer, 0, timer_interval / 1000.0);
         ev_timer_start(loop, &conn_info.timer);
 
-        // conn_info.timer.add_fd64_to_epoll(epoll_fd);
-        // conn_info.timer.set_timer_repeat_us(timer_interval*1000);
-
-        // mylog(log_debug,"conn_info.timer.get_timer_fd64()=%llu\n",conn_info.timer.get_timer_fd64());
-
-        // u64_t timer_fd64=conn_info.timer.get_timer_fd64();
-        // fd_manager.get_info(timer_fd64).ip_port=ip_port;
-
         conn_info.fec_encode_manager.set_data(&conn_info);
         conn_info.fec_encode_manager.set_loop_and_cb(loop, fec_encode_cb);
 
@@ -228,20 +215,26 @@ static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int rev
             }
 
             fd64_t fd64 = fd_manager.create(new_udp_fd);
-            // ev.events = EPOLLIN;
-            // ev.data.u64 = fd64;
-            // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, new_udp_fd, &ev);
 
             conn_info.conv_manager.s.insert_conv(conv, fd64);
             fd_manager.get_info(fd64).addr = addr;
 
-            ev_io &io_watcher = fd_manager.get_info(fd64).io_watcher;
-            io_watcher.u64 = fd64;
-            io_watcher.data = &conn_info;
+#ifdef __linux__
+            if (g_uring_ctx && g_uring_ctx->available) {
+                uring_add_multishot_recv(g_uring_ctx, new_udp_fd,
+                                          uring_tag(URING_TAG_SERVER_REMOTE, fd64));
+                uring_submit(g_uring_ctx);
+            } else
+#endif
+            {
+                ev_io &io_watcher = fd_manager.get_info(fd64).io_watcher;
+                io_watcher.u64 = fd64;
+                io_watcher.data = &conn_info;
 
-            ev_init(&io_watcher, remote_cb);
-            ev_io_set(&io_watcher, new_udp_fd, EV_READ);
-            ev_io_start(conn_info.loop, &io_watcher);
+                ev_init(&io_watcher, remote_cb);
+                ev_io_set(&io_watcher, new_udp_fd, EV_READ);
+                ev_io_start(conn_info.loop, &io_watcher);
+            }
 
             mylog(log_info, "[%s]new conv %x,fd %d created,fd64=%llu\n", addr.get_str(), conv, new_udp_fd, fd64);
         }
@@ -254,6 +247,26 @@ static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int rev
     }
 }
 
+static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {  // libev read callback for the server's listen socket: recvfrom() one datagram and hand it on
+    assert(!(revents & EV_ERROR));
+
+    int local_listen_fd = watcher->fd;
+
+    char data[buf_len];
+    int data_len;
+    address_t::storage_t udp_new_addr_in = {0};  // filled with the sender's address by recvfrom()
+    socklen_t udp_new_addr_len = sizeof(address_t::storage_t);
+    data_len = recvfrom(local_listen_fd, data, max_data_len + 1, 0,  // one byte more than max_data_len so an oversized datagram shows up as exactly max_data_len + 1
+                        (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len);
+    if (data_len < 0) {
+        mylog(log_error, "recv_from error,err=%s\n", get_sock_error());
+        return;
+    }
+
+    server_process_tunnel_packet(loop, local_listen_fd, data, data_len,  // shared with the io_uring receive path
+                                  (struct sockaddr *)&udp_new_addr_in, udp_new_addr_len);
+}
+
 static void remote_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {
     assert(!(revents & EV_ERROR));
 
@@ -304,12 +317,17 @@ static void conn_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int re
     data_from_remote_or_fec_timeout_or_conn_timer(conn_info, 0, is_conn_timer);
 }
 
+static void server_uring_drain(struct ev_loop *loop);
+
 static void prepare_cb(struct ev_loop *loop, struct ev_prepare *watcher, int revents) {
     assert(!(revents & EV_ERROR));
 
     delay_manager.check();
 }
 
+#ifdef __linux__
+#endif
+
 static void global_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int revents) {
     assert(!(revents & EV_ERROR));
 
@@ -319,6 +337,100 @@ static void global_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int
     mylog(log_trace, "events[idx].data.u64==(u64_t)timer.get_timer_fd()\n");
 }
 
+#ifdef __linux__
+static uring_ctx_t server_uring_ctx;
+static int server_local_listen_fd;
+
+static void server_uring_drain(struct ev_loop *loop) {  // drain every pending io_uring completion for the server's listen socket and per-conv remote sockets
+    uring_ctx_t *ctx = &server_uring_ctx;
+    int local_listen_fd = server_local_listen_fd;
+
+    for (;;) {  // repeat until the completion queue reports nothing ready
+        unsigned ready = uring_cq_ready(ctx);
+        if (ready == 0)
+            break;
+
+        int need_submit = 0;  // set when a receive was re-armed and must be submitted
+
+        for (unsigned i = 0; i < ready; i++) {
+            struct io_uring_cqe *cqe = uring_cqe_at(ctx, i);
+            uint8_t type = uring_tag_type(cqe->user_data);  // listen socket vs. per-conv remote socket
+            int more = cqe->flags & IORING_CQE_F_MORE;      // non-zero while the multishot request stays armed
+
+            if (cqe->res < 0) {  // failed completion: no packet to process
+                if (!more && cqe->res != -ECANCELED) {  // request ended for a reason other than cancellation: re-arm it
+                    if (type == URING_TAG_SERVER_LOCAL) {
+                        uring_add_multishot_recvmsg(ctx, local_listen_fd, cqe->user_data);
+                        need_submit = 1;
+                    } else if (type == URING_TAG_SERVER_REMOTE) {
+                        fd64_t fd64 = (fd64_t)uring_tag_payload(cqe->user_data);  // the tag payload carries the fd64
+                        if (fd_manager.exist(fd64)) {  // skip the re-arm once the fd64 has been closed
+                            uring_add_multishot_recv(ctx, fd_manager.to_fd(fd64), cqe->user_data);
+                            need_submit = 1;
+                        }
+                    }
+                }
+                continue;
+            }
+
+            if (type == URING_TAG_SERVER_LOCAL) {
+                uring_recv_buf_t recv_buf;
+                if (uring_parse_recvmsg_cqe(ctx, cqe, &recv_buf) == 0) {  // NOTE(review): on parse failure the buffer is not recycled here - confirm the helper covers that
+                    server_process_tunnel_packet(loop, local_listen_fd, recv_buf.data, recv_buf.len,
+                                                  (struct sockaddr *)&recv_buf.addr, recv_buf.addr_len);
+                    uring_recycle_buf(ctx, recv_buf.buf_id);  // recycle the buffer only after the packet has been processed
+                }
+            } else if (type == URING_TAG_SERVER_REMOTE) {
+                fd64_t fd64 = (fd64_t)uring_tag_payload(cqe->user_data);
+                uring_recv_buf_t recv_buf;
+                if (uring_parse_recv_cqe(ctx, cqe, &recv_buf) == 0) {
+                    if (fd_manager.exist(fd64)) {  // packets for an already-closed fd64 are dropped
+                        address_t &addr = fd_manager.get_info(fd64).addr;
+                        if (conn_manager.exist(addr)) {  // existence is checked first, so find_insert() below only looks up
+                            conn_info_t &conn_info = conn_manager.find_insert(addr);
+                            /* Zero-copy: URING_RECV_HEADROOM bytes before data
+                               reserved for in-place conv header insertion. */
+                            char *data = recv_buf.data - sizeof(u32_t);  // base pointer including sizeof(u32_t) bytes of headroom
+                            int data_len = recv_buf.len < (int)(buf_len - sizeof(u32_t)) ? recv_buf.len : (int)(buf_len - sizeof(u32_t));  // clamp to buf_len minus the header room
+                            server_process_remote_packet(conn_info, fd64, data, data_len);
+                        }
+                    }
+                    uring_recycle_buf(ctx, recv_buf.buf_id);  // recycled whether or not the packet was processed
+                }
+            }
+
+            if (!more) {  // successful completion but the multishot request has ended: re-arm it
+                if (type == URING_TAG_SERVER_LOCAL) {
+                    uring_add_multishot_recvmsg(ctx, local_listen_fd, cqe->user_data);
+                    need_submit = 1;
+                } else if (type == URING_TAG_SERVER_REMOTE) {
+                    fd64_t fd64 = (fd64_t)uring_tag_payload(cqe->user_data);
+                    if (fd_manager.exist(fd64)) {
+                        uring_add_multishot_recv(ctx, fd_manager.to_fd(fd64), cqe->user_data);
+                        need_submit = 1;
+                    }
+                }
+            }
+        }
+
+        /* Single batched advance + buffer commit */
+        uring_cq_advance(ctx, ready);
+        uring_buf_ring_commit(ctx);
+
+        /* Submit any re-arms and flush deferred completions in one syscall */
+        if (need_submit)
+            uring_submit_and_flush(ctx);
+        else
+            uring_flush(ctx);
+    }
+}
+
+static void server_uring_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) {  // libev read callback on the io_uring ring fd
+    assert(!(revents & EV_ERROR));
+    server_uring_drain(loop);  // process all completions currently queued
+}
+#endif
+
 int tunnel_server_event_loop() {
     int i, j, k;
     int ret;
@@ -349,17 +461,27 @@ int tunnel_server_event_loop() {
     //	mylog(log_fatal,"add  udp_listen_fd error\n");
     //	myexit(-1);
     // }
+    int use_uring = 0;
+#ifdef __linux__
+    server_local_listen_fd = local_listen_fd;
+    if (uring_init(&server_uring_ctx, 64, 256, buf_len) == 0) {
+        g_uring_ctx = &server_uring_ctx;
+        static struct ev_io uring_watcher;
+        ev_io_init(&uring_watcher, server_uring_cb, server_uring_ctx.ring_fd, EV_READ);
+        ev_io_start(loop, &uring_watcher);
+
+        uring_add_multishot_recvmsg(&server_uring_ctx, local_listen_fd,
+                                      uring_tag(URING_TAG_SERVER_LOCAL, 0));
+        uring_submit(&server_uring_ctx);
+        use_uring = 1;
+        mylog(log_info, "io_uring: active for server sockets\n");
+    }
+#endif
+
     struct ev_io local_listen_watcher;
     ev_io_init(&local_listen_watcher, local_listen_cb, local_listen_fd, EV_READ);
-    ev_io_start(loop, &local_listen_watcher);
-
-    // ev.events = EPOLLIN;
-    // ev.data.u64 = delay_manager.get_timer_fd();
-    // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, delay_manager.get_timer_fd(), &ev);
-    // if (ret!= 0) {
-    //	mylog(log_fatal,"add delay_manager.get_timer_fd() error\n");
-    //	myexit(-1);
-    // }
+    if (!use_uring)
+        ev_io_start(loop, &local_listen_watcher);
 
     delay_manager.set_loop_and_cb(loop, delay_manager_cb);
 
diff --git a/xor_spe.S b/xor_spe.S
new file mode 100644
index 0000000..d00ca81
--- /dev/null
+++ b/xor_spe.S
@@ -0,0 +1,159 @@
+/*
+ * xor_spe.S — SPE-accelerated XOR for PowerPC e500v2
+ *
+ * Uses SPE (Signal Processing Extension) 64-bit operations:
+ *   evldd/evstdd: 8-byte aligned load/store
+ *   evxor: 64-bit XOR
+ *
+ * GCC 9+ removed SPE intrinsics, but gas still supports the opcodes.
+ * Follows the Linux kernel pattern (arch/powerpc/crypto/aes-spe-core.S).
+ *
+ * Requires CONFIG_SPE=y in the kernel for context save/restore of the
+ * upper 32 bits of GPRs.
+ */
+
+/* Mark stack as non-executable (suppresses ld warning) */
+.section .note.GNU-stack,"",@progbits
+
+#ifdef HAVE_PPC_SPE
+
+.section .text
+.globl xor_tile_spe
+.type xor_tile_spe, @function
+
+/*
+ * void xor_tile_spe(char *data, int len, const char *tile, int tile_len)
+ *
+ * XOR data[0..len-1] with tile[0..tile_len-1] repeating.
+ *
+ * REQUIREMENTS (enforced by C caller in packet_cook.cpp):
+ *   - data MUST be 8-byte aligned
+ *   - tile MUST be 8-byte aligned
+ *   - tile_len MUST be a multiple of 8 (COOK_VEC_WIDTH)
+ *   - Tile offset always starts at 0 (caller pre-rotates if needed)
+ *
+ * PPC calling convention:
+ *   r3 = data pointer (8-byte aligned)
+ *   r4 = len
+ *   r5 = tile pointer (8-byte aligned)
+ *   r6 = tile_len (multiple of 8)
+ *
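+ * Clobbers: r0, r3 (advanced past the data), r7, r8, r9, r10, r11, r12, ctr, cr0, XER[CA] (addic)
+ * Uses SPE: 64-bit evldd/evxor/evstdd values in r7, r9, r10, r11, r12 (r8 holds only a 32-bit address)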
+ */
+.align 4
+xor_tile_spe:
+	/* Return immediately if len <= 0 */
+	cmpwi	%r4, 0
+	blelr
+
+	/* r0 = tile offset in bytes (always starts at 0) */
+	li	%r0, 0
+
+	/* Process 32 bytes per iteration (4x evldd). */
+	srwi.	%r7, %r4, 5	/* r7 = len / 32 = iteration count */
+	beq	.Ltail8		/* < 32 bytes remaining */
+	mtctr	%r7		/* CTR holds the count, so r7 is reused as a data register below */
+
+.Lmain_loop:
+	/* Compute tile pointer: r8 = tile + offset */
+	add	%r8, %r5, %r0
+
+	/* Load the first 3 data doublewords (the 4th is loaded just before its XOR) */
+	evldd	%r7, 0(%r3)
+	evldd	%r9, 8(%r3)
+	evldd	%r11, 16(%r3)
+
+	/* Load and XOR first tile doubleword */
+	evldd	%r10, 0(%r8)
+	evxor	%r7, %r7, %r10
+
+	/* Advance tile offset, check wrap for each doubleword */
+	addic	%r0, %r0, 8	/* addic rather than addi: in addi, RA=r0 encodes the literal 0, not the register */
+	cmpw	%r0, %r6
+	blt	.Lnw1
+	li	%r0, 0		/* offset reached tile_len: wrap to the start of the tile */
+.Lnw1:
+	add	%r8, %r5, %r0
+	evldd	%r10, 0(%r8)
+	evxor	%r9, %r9, %r10
+
+	addic	%r0, %r0, 8
+	cmpw	%r0, %r6
+	blt	.Lnw2
+	li	%r0, 0
+.Lnw2:
+	add	%r8, %r5, %r0
+	evldd	%r10, 0(%r8)
+	evxor	%r11, %r11, %r10
+
+	addic	%r0, %r0, 8
+	cmpw	%r0, %r6
+	blt	.Lnw3
+	li	%r0, 0
+.Lnw3:
+	add	%r8, %r5, %r0
+	evldd	%r12, 24(%r3)	/* 4th data doubleword */
+	evldd	%r10, 0(%r8)
+	evxor	%r12, %r12, %r10
+
+	addic	%r0, %r0, 8	/* leave the offset ready for the next iteration or the tail */
+	cmpw	%r0, %r6
+	blt	.Lnw4
+	li	%r0, 0
+.Lnw4:
+
+	/* Store 4 XORed doublewords */
+	evstdd	%r7, 0(%r3)
+	evstdd	%r9, 8(%r3)
+	evstdd	%r11, 16(%r3)
+	evstdd	%r12, 24(%r3)
+
+	addi	%r3, %r3, 32	/* advance the data pointer */
+	bdnz	.Lmain_loop
+
+.Ltail8:
+	/* Process remaining 8-byte chunks */
+	andi.	%r7, %r4, 24	/* r7 = (len % 32) & ~7 = remaining 8-byte blocks * 8 */
+	beq	.Ltail1
+	srwi	%r7, %r7, 3	/* r7 = number of remaining 8-byte blocks (1..3) */
+	mtctr	%r7
+
+.Ltail8_loop:
+	add	%r8, %r5, %r0	/* r8 = tile + offset */
+	evldd	%r7, 0(%r3)
+	evldd	%r10, 0(%r8)
+	evxor	%r7, %r7, %r10
+	evstdd	%r7, 0(%r3)
+	addi	%r3, %r3, 8
+	addic	%r0, %r0, 8
+	cmpw	%r0, %r6
+	blt	.Ltail8_nowrap
+	li	%r0, 0
+.Ltail8_nowrap:
+	bdnz	.Ltail8_loop
+
+.Ltail1:
+	/* Process remaining bytes (0-7) */
+	andi.	%r7, %r4, 7	/* r7 = len % 8 */
+	beqlr			/* nothing left: return */
+	mtctr	%r7
+
+.Ltail1_loop:
+	lbz	%r8, 0(%r3)	/* r8 = data byte */
+	lbzx	%r9, %r5, %r0	/* r9 = tile[offset] */
+	xor	%r8, %r8, %r9
+	stb	%r8, 0(%r3)
+	addi	%r3, %r3, 1
+	addic	%r0, %r0, 1	/* byte-wise offset advance, wrapped below like the doubleword paths */
+	cmpw	%r0, %r6
+	blt	.Ltail1_nowrap
+	li	%r0, 0
+.Ltail1_nowrap:
+	bdnz	.Ltail1_loop
+
+	blr
+
+.size xor_tile_spe, .-xor_tile_spe
+
+#endif /* HAVE_PPC_SPE */